Setup

Load packages

library(data.table)
library(stringr)
library(ggplot2)
theme_py <- theme_light() + theme(
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  panel.border = element_rect(colour = "black", fill = NA),
  text = element_text(size=20),
  strip.placement = "outside", 
  strip.text = element_text(size=20, color="black"),
  strip.background = element_rect(fill="white")
)
theme_set(theme_py)
library(patchwork)
library(ggrepel)
library(ComplexHeatmap)
library(DESeq2)
library(consensusSeekeR)
library(BSgenome.jaNemVect1.1.DToL.Assembly)
library(GenomicRanges)
library(parallel)
library(universalmotif)
library(monaLisa)
library(rtracklayer)
source("../motif-analysis/mta_downstream_functions.R")

Directories and colors

dat_dir <- "ATACSEQ/nucleosome_free_regions/"
pks_dir <- file.path(dat_dir, "macs2_peaks")
cns_dir <- file.path(dat_dir, "consensus_peaks")
hom_dir <- file.path(dat_dir, "homer")
ftp_dir <- file.path(dat_dir, "footprint")
res_dir <- file.path(dat_dir, "results")
fig_dir <- file.path(dat_dir, "plots")
for (newdir in c(cns_dir, hom_dir, res_dir, fig_dir))
  dir.create(newdir, showWarnings = FALSE)

# colors
condition_cols <- c("positive" = "blue", "negative" = "red")
line_cols = c("Elav" = "#ff7f00", "Fox" = "#984ea3", "Ncol" = "#4daf4a")

Peaks counts

Find consensus set of peaks

require(consensusSeekeR)
require(BSgenome.jaNemVect1.1.DToL.Assembly)
require(parallel)

# load peaks
pks_files <- list.files(pks_dir, pattern="narrowPeak", recursive=FALSE, full.names=TRUE)
names(pks_files) <- str_remove(basename(pks_files), ".mLb.clN.ncfree_peaks.narrowPeak")
nP_list <- lapply(names(pks_files), function(x) {
    nP <- readNarrowPeakFile(pks_files[x], extractRegions = TRUE, extractPeaks = TRUE)
    names(nP$narrowPeak) <- rep(x, length(nP$narrowPeak))
    names(nP$peak) <- rep(x, length(nP$peak))
    nP
})
regions <- GenomicRanges::GRangesList(lapply(nP_list, function(x) x$narrowPeak))
peaks <- GenomicRanges::GRangesList(lapply(nP_list, function(x) x$peak))
names(regions) <- names(pks_files)
names(peaks) <- names(pks_files)

# get consensus
chrList <- Seqinfo(
    seqnames = seqnames(BSgenome.jaNemVect1.1.DToL.Assembly),
    seqlengths = seqlengths(BSgenome.jaNemVect1.1.DToL.Assembly),
    isCircular = c(rep(FALSE, length(seqnames(BSgenome.jaNemVect1.1.DToL.Assembly))-1), TRUE),
    genome = "jaNemVect1.1"
)
message(Sys.time(), " Started calculating consensus")
ur <- unlist(regions)
up <- unlist(peaks)

# debugging
# id <- seqnames(ur)=="NC_064035.1" & start(ur)>1644829 & end(ur)<1649211
# ur <- ur[id]
# up <- up[id]
# chrList <- chrList["NC_064035.1"]

results <- findConsensusPeakRegions(
    narrowPeaks = ur,
    peaks = up,
    chrInfo = chrList,
    extendingSize = 250,
    expandToFitPeakRegion = FALSE,
    shrinkToFitPeakRegion = FALSE,
    minNbrExp = 2,
    nbrThreads = detectCores()-1
)
message(Sys.time(), " Done calculating consensus")
saveRDS(results, file.path(cns_dir,"consensusSeekeR-results.RDS"))

# resize peaks
pks_cns <- results$consensusRanges
pks_mid <- start(pks_cns) + (end(pks_cns)-start(pks_cns))/2
ranges(pks_cns) <- IRanges(pks_mid,width=0)
pks_scl <- promoters(pks_cns, upstream=125, downstream=125)

# trim out-of-bound peaks
seqlengths(pks_scl) <- seqlengths(BSgenome.jaNemVect1.1.DToL.Assembly)[names(seqlengths(pks_scl))]
pks_scl <- trim(pks_scl)

# save bed file
pks_bed <- as.data.table(pks_scl)
pks_bed[,name:=paste0("peak",1:.N)]
pks_bed[,width:=NULL][,score:="."]
setcolorder(pks_bed, c("seqnames", "start", "end", "name", "score", "strand"))
fwrite(pks_bed, file.path(cns_dir,"consensusSeekeR-peaks.bed"), sep="\t", col.names=FALSE)

Get scores for consensus peaks in all samples.

nth=12
res="ATACSEQ/nucleosome_free_regions/consensus_peaks/consensusSeekeR-peaks-counts.tsv"
bed="ATACSEQ/nucleosome_free_regions/consensus_peaks/consensusSeekeR-peaks.bed"
bam=$( echo ATACSEQ/nucleosome_free_regions/bam/*R*bam )

cols=$( echo -e seqnames start end name score strand | column -t )
for b in $bam
do 
  echo $b
  name=$( basename $b)
  cols=$( echo -e ${cols} ${name} | column -t )
done

echo -e ${cols} > ${res%%tsv}txt
bedtools multicov -bams ${bam} -bed ${bed} > ${res}

Data for differential peaks analysis

# load counts in peaks
pks_ct <- fread(file.path(cns_dir, "consensusSeekeR-peaks-counts.tsv"), header=FALSE)
colnames <- readLines(file.path(cns_dir, "consensusSeekeR-peaks-counts.txt"), n=1)
colnames <- str_remove_all(colnames, ".mLb.clN.ncfree.sorted.bam")
colnames <- str_split(colnames, " ")[[1]]
colnames(pks_ct) <- colnames

# column data
col_dt <- data.table(sample = colnames(pks_ct)[7:28])
col_dt[,reporterline := str_extract(sample,"Elav|Fox|Ncol")]
col_dt[,reporterline := factor(reporterline, levels=c("Elav","Fox","Ncol"))]
col_dt[,condition := str_extract(sample,"pos|neg")]
col_dt[,condition := str_replace_all(condition,c("pos"="positive","neg"="negative"))]
col_dt[,condition := factor(condition, levels = c("negative","positive"))]
col_dt[,group := paste(as.character(reporterline), as.character(condition), sep=""), by=1:nrow(col_dt)]
col_dt[,group := factor(group)]
fwrite(col_dt, file.path(res_dir, "design.tsv"), sep="\t")

col_df <- copy(col_dt)
class(col_df) <- "data.frame"
rownames(col_df) <- col_df$sample

# counts matrix
pks_mt <- as.matrix(pks_ct[, -c(1:6)])
rownames(pks_mt) <- pks_ct$name
pks_mt <- pks_mt[, rownames(col_df)]
write.table(
  pks_mt,
  file.path(res_dir, "mat.tsv"),
  sep = "\t",
  quote = FALSE
)

# DESeq2
require(DESeq2)
dds <- DESeqDataSetFromMatrix(
  countData = pks_mt,
  colData = col_df,
  design = ~ condition + reporterline
)
saveRDS(dds, file.path(res_dir, "dds.rds"))

Peaks counts distributions

# normalized accessibility distribution
pks_dt <- as.data.table(pks_mt, keep.rownames = "peak")
pks_dt <- melt.data.table(
  pks_dt,
  id.vars = "peak",
  variable.name = "sample",
  value.name = "norm_counts"
)
pks_dt <- merge.data.table(pks_dt, col_dt, by="sample")
setorder(pks_dt, peak, condition, reporterline)
pks_dt[,sample:=factor(sample, levels = unique(pks_dt$sample))]

gp_acc <- ggplot(pks_dt, aes(sample, log10(norm_counts), fill=reporterline)) +
  geom_violin(scale = "width", alpha = 0.8, color = "black") +
  geom_boxplot(width = 0.5, outlier.shape = NA, alpha = 0.8, color = "black") + 
  scale_fill_manual(values = line_cols, limits = force) +
  scale_y_continuous(expand = expansion(mult = c(0,0.01))) +
  labs(x="samples", y="peak\naccessibility") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
    legend.title = element_blank(), legend.position = "none"
  )

var_dt <- pks_dt[,.(var=var(norm_counts)),.(peak)]
gp_var <- ggplot(var_dt, aes(log10(var))) + 
  geom_density() +
  scale_x_continuous(limits=c(-4,NA)) +
  scale_y_continuous(expand = expansion(mult = c(0,0.01))) +
  labs(x = "log10(accessibility variance)")

gp_pch <- gp_acc / gp_var + plot_layout(heights = c(2,1))
gp_pch

Use normalized log-transformed accessibility data

# normalize samples
dds <- readRDS(file.path(res_dir, "dds.rds"))
dds <- estimateSizeFactors(dds)
norm_mt <- counts(dds, normalized=TRUE)

# save
write.table(
  norm_mt,
  file.path(res_dir, "mat_norm.tsv"),
  sep="\t",
  row.names = TRUE,
  quote = FALSE
)

# row normalize to bring peaks to same range
norm_mt <- (norm_mt+10)/apply(norm_mt+10,1,median) 
norm_mt <- log2(norm_mt)

# save
write.table(
  norm_mt,
  file.path(res_dir, "mat_norm_fc.tsv"),
  sep="\t",
  row.names = TRUE,
  quote = FALSE
)

PCA

PCA on all samples.

set.seed(1950)
pca_res <- prcomp(t(norm_mt), center = TRUE)

# variance explained
pca_var <- data.table(pct_var = round(((pca_res$sdev) ^ 2 / sum((pca_res$sdev) ^ 2)* 100), 2))
pca_var[,pct_cum:=cumsum(pct_var)]
pca_var[,PC:=factor(1:.N)]
gp_var <- ggplot(pca_var, aes(PC, pct_var)) + 
  geom_bar(stat = "identity") +
  geom_line(aes(y = pct_cum, group = 1)) + 
  geom_point(aes(y = pct_cum)) +
  scale_y_continuous(expand = expansion(0.01,0)) +
  labs(y = "% of variance\nexplained", x = "PC") +
  theme(panel.grid.major.y = element_line(size = 0.5))

pca_dt <- as.data.table(pca_res$x, keep.rownames = "sample")
pca_dt <- merge.data.table(col_dt, pca_dt, by="sample", sort=FALSE)
gp_bip <- ggplot(pca_dt, aes(PC1, PC2, fill=reporterline, shape=condition)) + 
  geom_point(size=5) +
  scale_fill_manual(values = line_cols) +
  scale_shape_manual(values = c("positive" = 21, "negative" = 24)) +
  guides(fill = guide_legend(override.aes=list(shape=21))) +
  geom_text_repel(aes(label = sample))

gp_pch <- gp_var / gp_bip 
gp_pch

PCA per cell line

set.seed(1950)

gp_l <- lapply(names(line_cols), function(cl) {
  
  pca_res <- prcomp(t(norm_mt[,grep(cl,colnames(norm_mt))]), center = TRUE)
  
  # variance explained
  pca_var <- data.table(pct_var = round(((pca_res$sdev) ^ 2 / sum((pca_res$sdev) ^ 2)* 100), 2))
  pca_var <- pca_var[-nrow(pca_var)]
  pca_var[,pct_cum:=cumsum(pct_var)]
  pca_var[,PC:=factor(1:.N-1)]
  gp_var <- ggplot(pca_var, aes(PC, pct_var)) + 
    geom_bar(stat = "identity") +
    geom_line(aes(y = pct_cum, group = 1)) + 
    geom_point(aes(y = pct_cum)) +
    scale_y_continuous(expand = expansion(0.01,0)) +
    labs(y = "% of variance\nexplained", x = "PC") +
    theme(panel.grid.major.y = element_line(size = 0.5))
  
  # pca plot
  pca_dt <- as.data.table(pca_res$x, keep.rownames = "sample")
  pca_dt <- merge.data.table(col_dt, pca_dt, by="sample", sort=FALSE)
  gp_bip <- ggplot(pca_dt, aes(PC1, PC2, fill=condition, shape=condition)) + 
    geom_point(size=5) +
    scale_shape_manual(values = c("positive" = 21, "negative" = 24)) +
    scale_fill_manual(values = condition_cols) +
    guides(fill = guide_legend(override.aes=list(shape=21))) +
    geom_text_repel(aes(label = sample))
  
  gp_var / gp_bip 

})
gp_l
## [[1]]

## 
## [[2]]

## 
## [[3]]

Marker peaks

Use normalized peak counts

norm_mt <- read.table(
  file.path(res_dir, "mat_norm_fc.tsv"),
  header = TRUE
)

Identify marker peaks

# gene markers by high FC + significant DEseq2 LTR test 
dds <- readRDS(file.path(res_dir, "dds.rds"))
dds <- DESeq(dds, test="LRT", reduced=~1)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
dds_res <- results(dds)
dds_qval <- dds_res$padj 
names(dds_qval) <- rownames(dds_res)
peaks_deseq <- names(which(dds_qval<1e-2))
peaks_high <- names(which(apply(
  norm_mt, 1, function(x) sort(x, decreasing = TRUE)[2] >= 1.8
)))
peaks_marks <- intersect(peaks_high, peaks_deseq)
length(peaks_marks)
## [1] 2844

Cluster peaks

set.seed(1950)

# determine k for kmeans
ks <- 1:30
tot_withinss <- sapply(ks,  function(k) {
  cl <- kmeans(norm_mt[peaks_marks,], k)
  cl$tot.withinss
})
elbow_dt <- data.table(k = ks, tot_withinss = tot_withinss)
elbow_gp <- ggplot(elbow_dt, aes(x = k, y = tot_withinss)) +
  geom_line() + geom_point()+
  scale_x_continuous(breaks = ks)
elbow_gp

Cluster peaks

# kmeans
set.seed(1950)
k <- 19
cl <- kmeans(norm_mt[peaks_marks,], k)
peaks_order_list <- tapply(names(cl$cluster), cl$cluster, function(gs) {
  cor_peaks <- cor(t(norm_mt[gs,]))
  hclust_peaks <- hclust(as.dist(
    1 - cor(cor_peaks)),
    method = "ward.D2"
  )
  rownames(cor_peaks)[hclust_peaks$order]
})
names(peaks_order_list) <- unique(cl$cluster)
peaks_order_list <- peaks_order_list[as.character(seq_along(peaks_order_list))]

# cluster clusters
cluster_order <- hclust(dist(
  cor(t(cl$centers)), method = "euclidean"
), method = "ward.D2")$order
peaks_order_list <- peaks_order_list[as.character(cluster_order)]
peaks_order <- unname(unlist(peaks_order_list))
clusters_dt <- data.table(
  peak = peaks_order,
  clusters = as.character(rep(
    names(peaks_order_list),
    sapply(peaks_order_list, length)
  ))
)

# group clusters (manually)
clusters_dt[clusters %in% c(16), group :=1 ] # Elav
clusters_dt[clusters %in% c(5), group :=2 ] # Elav + Fox
clusters_dt[clusters %in% c(14,17), group := 3] # Fox
clusters_dt[clusters %in% c(6,11), group := 4] # Fox + Ncol
clusters_dt[clusters %in% c(9), group := 5]
clusters_dt[clusters %in% c(13,4), group := 6] # Elav + Ncol
clusters_dt[clusters %in% c(1,2,15,7,8,10,19,18), group := 7] # Ncol
clusters_dt[clusters %in% c(12), group := 8]
clusters_dt[clusters %in% c(3), group := 9]
setorder(clusters_dt, group)
peaks_order <- clusters_dt$peak

# add clusters info to peaks coordinates bed file
pks_bed <- fread(file.path(cns_dir, "consensusSeekeR-peaks.bed"))
setnames(pks_bed, c("V4"), c("peak"))
clusters_bed <- merge.data.table(
  pks_bed, clusters_dt,
  by = "peak",
  sort = FALSE
)
setcolorder(clusters_bed, c(colnames(pks_bed)))
clusters_bed[, clusters := paste0("C", clusters)]
clusters_bed[, group := paste0("G", group)]
fwrite(
  clusters_bed,
  file.path(res_dir, "consensusSeekeR-peaks-clusters.bed"),
  sep = "\t",
  col.names = FALSE
)

Heatmap of markers

# order rows and columns
samples_order <- col_dt[order(condition,reporterline)]$sample
plot_mt <- norm_mt[peaks_order,samples_order]

# center at 0
plot_min <- quantile(abs(range(plot_mt)),0.75)
plot_min <- 5
plot_mt <- pmin(pmax(plot_mt,-plot_min),plot_min)

# heatmap colors
col_vec <- colorRampPalette(
  RColorBrewer::brewer.pal(11,'BrBG')
)(1000)
col_fun <- circlize::colorRamp2(
  seq(-plot_min, plot_min, length.out = length(col_vec)),
  col_vec
)

# color annotaitons
col_ann <- HeatmapAnnotation(
    which = "column", border = TRUE,
    "reporterline" = as.character(
      col_dt[match(colnames(plot_mt),sample)]$reporterline
    ),
    "condition" = as.character(
      col_dt[match(colnames(plot_mt),sample)]$condition
    ), 
    col = list(
      "reporterline" = line_cols,
      "condition" = condition_cols
    )
)

# # peak module annotations (clusters)
# clann <- clusters_dt[match(rownames(plot_mt),peak)]$clusters
# clann_lab <- unique(clusters_dt[match(rownames(plot_mt),peak)]$clusters)
# clann <- factor(clann, levels=clann_lab)
# module_ann <- HeatmapAnnotation(
#     which = "row", border = TRUE,
#     "cluster" = anno_block(
#       labels = clann_lab,
#       gp = gpar(col=NA)
#     )
# )
# row_split <- clann

# peak module annotations (manually groupped clusters)
grann <- clusters_dt[match(rownames(plot_mt),peak)]$group
grann_lab <- unique(clusters_dt[match(rownames(plot_mt),peak)]$group)
grann <- factor(grann, levels=grann_lab)
module_ann <- HeatmapAnnotation(
    which = "row", border = TRUE,
    "group" = anno_block(
      labels = grann_lab,
      gp = gpar(col=NA)
    )
)
row_split <- grann


# peaks annotations
row_labels_marks_ids <- match(
  clusters_dt[,.SD[1],clusters]$peak,
  rownames(plot_mt)
)
row_labels_marks <- clusters_dt[,.SD[1], clusters]$clusters
mark_ann <- HeatmapAnnotation(
  which = "row",
  marker = anno_mark(at = row_labels_marks_ids, labels = row_labels_marks),
  show_legend = FALSE
)
mark_ann <- HeatmapAnnotation(
    which = "row", border = TRUE,
    "padj<0.05" = rownames(plot_mt) %in% peaks_deseq,
    "var>1" = rownames(plot_mt) %in% peaks_vari,
    "FC>2" = rownames(plot_mt) %in% peaks_fc,
    col = list(
      "padj<0.05" = c("TRUE" = "#e6ab02", "FALSE"="#d9d9d9"),
      "var>1" = c("TRUE" = "#3690c0", "FALSE"="#d9d9d9"),
      "FC>2" = c("TRUE" = "#e7298a", "FALSE"="#d9d9d9")
    )
)

# heatmap
hm <- Heatmap(
  plot_mt, name = "normalized\naccessibility",
  col = col_fun,
  cluster_rows = FALSE, cluster_columns = FALSE,
  show_row_names = FALSE, show_column_names = TRUE,
  row_title = "peaks", 
  row_split = row_split, 
  cluster_row_slices = FALSE,
  top_annotation = col_ann,  
  left_annotation = module_ann, right_annotation = mark_ann,
  border = TRUE
)
hm

Peak to gene assignment

Load data

# peaks
peaks <- fread(file.path(cns_dir, "consensusSeekeR-peaks.bed"))
setnames(peaks, c("seqnames", "start", "end", "name", "score", "strand"))
peaks_gr <- makeGRangesFromDataFrame(peaks, keep.extra.columns = TRUE)

# genes
genes_gr <- rtracklayer::import(
  "annotation/Nvec_v4_merged_annotation_sort.gtf",
  "gtf"
)
genes_gr <- genes_gr[genes_gr$type == "transcript"]
genes_gr$name <- genes_gr$transcript_id

# non-expressed genes to remove
counts <- read.table(
  "RNASEQ_QUANTIFICATION/raw_counts_rnaseq.tsv",
  header = TRUE,
  row.names = 1
)
exclude_genes <- names(which(
  apply(counts, 1, function(x) sum(x) < 10)
))
length(exclude_genes) # 437
## [1] 437

Assign peaks to genes

# assign
assign <- mta_match_peaks_to_genes(
  gff_object = genes_gr,
  peak_object = peaks_gr,
  index_object = "genome/Nvec_vc1.1_gDNA.fasta.fai",
  list_genes = NULL,
  feature_to_match = "transcript",
  feature_field = "name",
  exclude_genes = NULL,
  max_tss_dist = 10000,
  min_overlap = 0,
  max_gap = 1,
  promoter_upstream = 200,
  promoter_downstream = 50,
  promoter_object = NULL
)
setDT(assign)
setnames(assign, "chr", "seqnames")
assign[, seqnames := factor(seqnames, levels = unique(peaks$seqnames))]
setorder(assign, seqnames, start, end)

# save
fwrite(
  assign,
  file.path(res_dir, "consensusSeekeR-peaks-gene-assignment.tsv"),
  sep = "\t",
  col.names = TRUE
)

Inspect peak assignment results

assign <- fread(file.path(res_dir, "consensusSeekeR-peaks-gene-assignment.tsv"))

max_assign_peaks_per_gene <- unique(
  assign[, .(peak,gene)][!is.na(gene)]
)[, .N, gene]
gb1 <- ggplot(max_assign_peaks_per_gene, aes(N)) + 
  geom_bar() +
  labs(x = "peaks per gene", y = "genes") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.01))) +
  scale_x_continuous(
    breaks = c(1, seq(5, max(max_assign_peaks_per_gene$N), 5))
  )

max_assign_gene_per_peak <- unique(
  assign[, .(peak,gene)][!is.na(gene)]
)[, .N, peak]
gb2 <- ggplot(max_assign_gene_per_peak, aes(N)) +
  geom_bar() +
  labs(x = "genes per peak", y = "peaks") +
  scale_y_continuous(
    expand = expansion(mult = c(0, 0.01)),
    labels = scales::label_number(scale = 0.001, suffix = "K")
  ) +
  scale_x_continuous(breaks = seq(1, max(max_assign_peaks_per_gene$N)))

gb1 + gb2

Calculate gene accessibility scores as weighted sum of normalized peak scores assigned to genes.

assign <- fread(file.path(res_dir, "consensusSeekeR-peaks-gene-assignment.tsv"))
setnames(assign, "seqnames", "chr")

genes_gff <- genes_gr#[!genes_gr$name %in% exclude_genes]
genes_gff$symbol <- genes_gff$name

peaks_mt <- read.table(file.path(res_dir, "mat_norm.tsv"))

# cells_groups <- fread(file.path(res_dir, "design.tsv"))[, .(sample, group)]
# setnames(cells_groups, "sample", "cell")

gscore <- mta_gene_scores(
  genes_peaks_table = assign,
  gff_object = genes_gff,
  peak_object = peaks_gr,
  peaks_mat = peaks_mt
)

saveRDS(gscore, file.path(res_dir, "consensusSeekeR-gene-scores.rds"))
write.table(
  gscore$genes_scores_matrix,
  file.path(res_dir, "consensusSeekeR-gene-scores.tsv"),
  sep = "\t",
  col.names = TRUE
)

For DESeq2, calculate scores as sums of raw peak counts (they have to be raw counts and integers).

assign <- fread(file.path(res_dir, "consensusSeekeR-peaks-gene-assignment.tsv"))
setnames(assign, "seqnames", "chr")

genes_gff <- genes_gr#[!genes_gr$name %in% exclude_genes]
genes_gff$symbol <- genes_gff$name

peaks_mt <- read.table(file.path(res_dir, "mat.tsv"))

gscore <- mta_gene_scores(
  genes_peaks_table = assign,
  gff_object = genes_gff,
  peak_object = peaks_gr,
  peaks_mat = peaks_mt,
  weight_peaks = FALSE
)

saveRDS(gscore, file.path(res_dir, "consensusSeekeR-gene-scores-raw.rds"))
write.table(
  gscore$genes_scores_matrix,
  file.path(res_dir, "consensusSeekeR-gene-scores-raw.tsv"),
  sep = "\t",
  col.names = TRUE
)

Use normalized gene scores

# load gene scores
norm_mt <- read.table(
  file.path(res_dir, "consensusSeekeR-gene-scores.tsv"),
  header = TRUE
)
# row normalize to bring genes to same range
norm_mt <- (norm_mt + 10) / apply(norm_mt + 10, 1, median) 
norm_mt <- log2(norm_mt)

Identify marker genes (here use raw gene scores for DESeq2)

# gene markers by normalized expression
genes_fc <- names(which(apply(norm_mt, 1, function(x)
  sort(x, decreasing = TRUE)[1] >= 2
)))
genes_vari <- names(which(apply(norm_mt, 1, function(x)
  var(x) > 1
)))
con_mt <- read.table(
  file.path(res_dir, "consensusSeekeR-gene-scores-raw.tsv"),
)

# gene markers by high FC + significant DEseq2 LTR test
dds <- DESeqDataSetFromMatrix(
  countData = con_mt,
  colData = col_df,
  design = ~ condition + reporterline + condition:reporterline
)
dds <- DESeq(dds, test = "LRT", reduced = ~ 1)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
dds_res <- results(dds)
dds_qval <- dds_res$padj
names(dds_qval) <- rownames(dds_res)
genes_deseq <- names(which(dds_qval < 1e-2))

genes_high <- names(which(apply(norm_mt, 1, function (x)
  sort(x, decreasing = TRUE)[1] > 1.8
)))

gene_marks <- intersect(genes_high, genes_deseq)

# inclusde golden markers
marks_gold <- fread(
  file.path("annotation", "golden-marks-230116.tsv"),
  fill = TRUE
)[, 1:2]
setnames(marks_gold, c("gene", "name"))
marks_gold[, name := str_remove(name, "^Nv")]
gene_marks <- unique(c(gene_marks, marks_gold[gene %in% genes_deseq]$gene))

Select number of clusters for genes

set.seed(1950)

# determine k for kmeans
ks <- 1:30
tot_withinss <- sapply(ks,  function(k) {
  cl <- kmeans(norm_mt[gene_marks, ], k)
  cl$tot.withinss
})
elbow_dt <- data.table(k = ks, tot_withinss = tot_withinss)
elbow_gp <- ggplot(elbow_dt, aes(x = k, y = tot_withinss)) +
  geom_line() + geom_point() +
  scale_x_continuous(breaks = ks) + 
  theme(panel.grid.major = element_line(size = 0.5))
elbow_gp

Cluster genes

set.seed(1950)

# kmeans
k <- 18
cl <- kmeans(norm_mt[gene_marks, ], k)
gene_order_list <- tapply(names(cl$cluster), cl$cluster, function(gs) {
  cor_genes <- cor(t(norm_mt[gs,]))
  hclust_genes <- hclust(as.dist(1 - cor(cor_genes)), method = "ward.D2")
  rownames(cor_genes)[hclust_genes$order]
})
names(gene_order_list) <- unique(cl$cluster)
gene_order_list <- gene_order_list[as.character(seq_along(gene_order_list))]

# cluster clusters
cluster_order <- hclust(
  dist(cor(t(cl$centers)),
  method = "euclidean"),
  method = "ward.D2"
)$order
gene_order_list <- gene_order_list[as.character(cluster_order)]
gene_order <- unname(unlist(gene_order_list[cluster_order]))
clusters_dt <- data.table(
  gene = unlist(gene_order_list),
  clusters = as.character(
    rep(names(gene_order_list), sapply(gene_order_list, length))
  )
)

# group clusters (manually)
clusters_dt[clusters %in% c(8, 15), group := 1] # Elav
clusters_dt[clusters %in% c(1, 4), group := 2] # Elav + Fox
clusters_dt[clusters %in% c(16, 2), group := 3] # Fox
clusters_dt[clusters %in% c(3), group := 4] # Fox + Ncol
clusters_dt[clusters %in% c(13), group := 5] # Fox + Ncol + Elav
clusters_dt[clusters %in% c(17, 5), group := 6] # Elav + Ncol
clusters_dt[clusters %in% c(7, 14, 6, 10, 11, 9), group := 7] # Ncol
clusters_dt[clusters %in% c(12), group := 8]
clusters_dt[clusters %in% c(18), group := 9]
setorder(clusters_dt, group)
gene_order <- clusters_dt$gene

# save
fwrite(
  clusters_dt,
  file.path(res_dir, "genes_clusters.tsv"),
  sep = "\t"
)

Heatmap of markers

# order rows and columns
col_dt <- fread(file.path(res_dir, "design.tsv"))
samples_order <- col_dt[order(condition, reporterline)]$sample
plot_mt <- norm_mt[gene_order, samples_order]

# center at 0
plot_min <- quantile(abs(range(plot_mt)), 0.75)
plot_min <- 4
plot_mt <- pmin(pmax(plot_mt, -plot_min), plot_min)

# heatmap colors
col_vec <- colorRampPalette(RColorBrewer::brewer.pal(11, 'BrBG'))(1000)
col_fun <- circlize::colorRamp2(
  seq(-plot_min, plot_min, length.out = length(col_vec)),
  col_vec
)

# color annotations
col_ann <- HeatmapAnnotation(
    which = "column",
    border = TRUE,
    "reporterline" = as.character(
      col_dt[match(colnames(plot_mt), sample)]$reporterline
    ),
    "condition" = as.character(
      col_dt[match(colnames(plot_mt), sample)]$condition
    ),
    col = list("reporterline" = line_cols, "condition" = condition_cols)
)

# gene module annotations
clann <- clusters_dt[match(rownames(plot_mt), gene)]$clusters
clann_lab <- unique(clusters_dt[match(rownames(plot_mt), gene)]$clusters)
clann <- factor(clann, levels = clann_lab)
module_ann <- HeatmapAnnotation(
    which = "row", border = TRUE,
    "cluster" = anno_block(
      labels = clann_lab,
      gp = gpar(col = NA)
    )
)
row_split <- clann

# gene module annotations (manual groups)
grann <- clusters_dt[match(rownames(plot_mt), gene)]$group
grann_lab <- unique(clusters_dt[match(rownames(plot_mt), gene)]$group)
grann <- factor(grann, levels = grann_lab)
module_ann <- HeatmapAnnotation(
    which = "row", border = TRUE,
    "group" = anno_block(
      labels = grann_lab,
      gp = gpar(col = NA)
    )
)
row_split <- grann

# genes annotations
marks_gold <- fread(
  file.path("annotation", "golden-marks-230116.tsv"),
  fill = TRUE
)[, 1:2]
setnames(marks_gold, c("gene", "name"))
marks_gold[, name := str_remove(name, "^Nv")]

tfs_annt <- fread(
  file.path("annotation", "curated_TFh_Nvec_DToL_names.tsv"),
  header = FALSE
)
setnames(tfs_annt, c("gene", "pfam", "og"))

marks_tfs <- merge.data.table(
  tfs_annt,
  marks_gold,
  by = "gene", all = TRUE, sort = FALSE
)
marks_tfs[is.na(marks_tfs)] <- ""
marks_tfs <- marks_tfs[gene %in% rownames(plot_mt)]
marks_tfs[
  gene %in% clusters_dt[group==5]$gene,
  name := str_replace_all(og, c(
    "HMGbox_Sox.HG1.25:like:BHMG1/SOX1/SOX2/SOX3/SOX14/SOX15/SOX21/SRY:likeclu:18/26/28" = "SOX1/2/3/14/15/21-like",
    "zf-C2H2.HG5.17:PRDM6" = "zf-C2H2 PRDM6",
    "zf-C2H2.Unclassified" = ""
  ))
]
marks_tfs <- marks_tfs[name != ""]

row_labels_marks_ids <- match(marks_tfs$gene, rownames(plot_mt))
row_labels_marks <- marks_tfs[
  match(rownames(plot_mt)[row_labels_marks_ids], gene)
]$name
mark_ann <- HeatmapAnnotation(
    which = "row", border = TRUE,
    "padj<0.01" = rownames(plot_mt) %in% genes_deseq,
    "var>1" = rownames(plot_mt) %in% genes_vari,
    "FC>2" = rownames(plot_mt) %in% genes_fc,
    "marker" = anno_mark(
      at = row_labels_marks_ids,
      labels = row_labels_marks
    ),
    col = list(
      "padj<0.01" = c("TRUE" = "#e6ab02", "FALSE"="#d9d9d9"),
      "var>1" = c("TRUE" = "#3690c0", "FALSE"="#d9d9d9"),
      "FC>2" = c("TRUE" = "#e7298a", "FALSE"="#d9d9d9")
    )
)

# heatmap
hm <- Heatmap(
  plot_mt, name = "normalized\naccessibility",
  col = col_fun,
  cluster_rows = FALSE, cluster_columns = FALSE,
  show_row_names = FALSE, show_column_names = TRUE,
  row_title = "genes",
  row_split = row_split,
  cluster_row_slices = FALSE,
  top_annotation = col_ann, 
  right_annotation = mark_ann,
  left_annotation = module_ann,
  border = TRUE
)
hm

Motif discovery

Save per-group foreground and background peaks for motif discovery

dir.create(file.path(hom_dir, "peaks"))
dir.create(file.path(hom_dir, "results"))
clusters_bed <- fread(file.path(res_dir, "consensusSeekeR-peaks-clusters.bed"))
setnames(clusters_bed, c("seqnames","start","end","name","score","strand","cluster","group"))
clusters_bed[,group:=factor(group, levels=paste0("G",seq_along(unique(group))))]
for (ci in levels(clusters_bed$group)) {
    fg <- clusters_bed[group==ci]
    bg <- clusters_bed[group!=ci]
    fwrite(fg, file.path(hom_dir, "peaks",sprintf("peaks-%s-fg.bed",ci)), col.names=FALSE, sep="\t")
    fwrite(bg, file.path(hom_dir, "peaks",sprintf("peaks-%s-bg.bed",ci)), col.names=FALSE, sep="\t")
}

De novo and known motifs in groups of accessible peaks


findMotifs(){
    genome="genome/Nvec_vc1.1_gDNA.fasta"
    homdir="ATACSEQ/nucleosome_free_regions/homer"
    bedfg=${homdir}"/peaks/peaks-G"${1}"-fg.bed"
    bedbg=${homdir}"/peaks/peaks-G"${1}"-bg.bed"
    outdir=$homdir"/results/G${1}-Homer"
    echo ""
    echo "Starting HOMER analysis for" $1
    echo "using the intervals in" $bedfg
    echo "Output will be saved to" $outdir
    findMotifsGenome.pl $bedfg $genome $outdir -size 250 -len 6,8,10,12 -bg $bedbg 
    echo ""
    echo "Finished de novo analysis for module" $1
}

for cluster in {1..9}
do
findMotifs "$cluster" &
    done
wait
echo "Done."

Parse de novo Homer results

dir="./ATACSEQ/nucleosome_free_regions/homer/results/"
modules=$( find ${dir} -maxdepth 2 -iname "*-Homer" -printf "%f\n" )
for i in $modules
do 
  less ${dir}/${i}/homerMotifs.all.motifs | grep ">" > tmp.txt
  awk -v m="$M" 'BEGIN {FS="\t"} {OFS="\t"} {print m, $0}' tmp.txt >> ${dir}/${i}/homerResults.txt
done

Parse all Homer results

clusters_bed <- fread(file.path(res_dir, "consensusSeekeR-peaks-clusters.bed"))
setnames(
  clusters_bed, 
  c("seqnames", "start", "end", "name", "score", "strand", "cluster", "group")
)
clusters_bed[,group:=factor(group, levels=paste0("G",seq_along(unique(group))))]

ParseHomerDenovo <- function(fn, eta=1) {
  
  # parse homerResults.txt
  dt <- fread(fn, sep="\t", header=FALSE)[,-1]
  setnames(dt,c("consensus","name","logodds_threshold","logpval","0","occurence","stats"))
  dt[,consensus:=stringr::str_remove(consensus,">")]
  dt[,c("fg","bg","pval"):=tstrsplit(occurence,",")]
  dt_occurence <- dt[,lapply(.SD,stringr::str_remove,pattern="[TBP]:"),.SDcols=c("fg","bg","pval")]
  dt_perc <- dt_occurence[,lapply(.SD,function(x) as.numeric(stringr::str_extract(x,"(?<=\\()\\d+\\.*\\d+"))/100),.SDcols=c("fg","bg")]
  setnames(dt_perc,c("fg","bg"),c("fg_perc","bg_perc"))
  dt_counts <- dt_occurence[,lapply(.SD,function(x) as.numeric(stringr::str_extract(x,"\\d+\\.*\\d*"))),.SDcols=c("fg","bg")]
  setnames(dt_counts,c("fg","bg"),c("fg_count","bg_count"))
  dt_occurence[,c("fg","bg"):=NULL]
  dt_occurence[,pval:=as.numeric(pval)]
  dt[,c("fg","bg","pval","occurence"):=NULL]
  dt <- cbind(dt,dt_perc,dt_counts,dt_occurence)
  dt$qval <- p.adjust(dt$pval, method = "BH")
  dt[,fc:=(fg_count+eta)/(bg_count+eta)]
  
  # parse homerResults folder to get names of best hits for denovo motifs
  hd <- str_remove(fn, ".txt")
  ms <- list.files(hd, pattern = "motif\\d+.motif", full.names=TRUE)
  ml <- lapply(ms, function(x) {
    m <- universalmotif::read_homer(x)
    m@name
  })
  ml <- unlist(unname(ml))
  names(ml) <- str_extract(ml, ".+(?=\\,BestGuess)")
  names(ml) <- ml
  dt[,name2:=ml[name]]
  dt[!is.na(name2), name:=name2]
  dt[,name2:=NULL]
  
  # return
  dt[,.(name,consensus,fc,pval,logpval,qval,fg_count,fg_perc,bg_count,bg_perc)]
}

ParseHomerKnown <- function(fn, eta=1) {
  dt <- fread(fn)
  setnames(dt, c("name","consensus","pval","logpval","qval","fg_count","fg_perc","bg_count","bg_perc"))
  dt[,fg_perc:=as.numeric(stringr::str_remove(fg_perc,"\\%"))/100]
  dt[,bg_perc:=as.numeric(stringr::str_remove(bg_perc,"\\%"))/100]
  dt[,fc:=(fg_count+eta)/(bg_count+eta)]
  dt[,.(name,consensus,fc,pval,logpval,qval,fg_count,fg_perc,bg_count,bg_perc)]
}

hm_list <- lapply(levels(clusters_bed$group), function(ci) {
  dn_fn <- file.path(hom_dir,"results",sprintf("%s-Homer",ci),"homerResults.txt")
  dn_dt <- ParseHomerDenovo(dn_fn)
  kn_fn <- file.path(hom_dir,"results",sprintf("%s-Homer",ci),"knownResults.txt")
  kn_dt <- ParseHomerKnown(kn_fn)
  hm_dt <- rbindlist(list(denovo=dn_dt, known=kn_dt), idcol="set")
  hm_dt
})
names(hm_list) <- levels(clusters_bed$group)
hm_dt <- rbindlist(hm_list, idcol = "group")
fwrite(hm_dt, file.path(hom_dir, "results", "allResuls.txt"), sep="\t", col.names = TRUE)

Save all Homer motifs

mt_dt <- unique(hm_dt[,.(name)])

mt_list <- lapply(levels(clusters_bed$group), function(ci) {
  ci_dir <- file.path(hom_dir,"results",sprintf("%s-Homer",ci))
  ci_mfn <- c(
    list.files(file.path(ci_dir, "homerResults"), pattern = "motif\\d+.motif", full.names=TRUE),
    list.files(file.path(ci_dir, "knownResults"), pattern = "known\\d+.motif", full.names=TRUE)
  )
  ci_mot <- lapply(ci_mfn, universalmotif::read_homer)
  ci_mot
})
names(mt_list) <- levels(clusters_bed$group)

saveRDS(mt_list, file.path(hom_dir,"results","allMotifs.RDS"))

Motif archetypes

Create archetypes

Previously we pulled together Nematostella direct and inferred motifs from CisBP, and motifs additionally transferred based on the DBD %ID. We load them now, together with motifs found by Homer, then we do motif clustering and archetyping.

Load Homer motifs

# load Homer motifs
data_homer <- fread(file.path(hom_dir, "results","allResuls.txt"))
pwms_homer <- readRDS(file.path(hom_dir,"results","allMotifs.RDS"))
pwms_homer <- unlist(unname(pwms_homer))
names(pwms_homer) <- sapply(pwms_homer, function(x) x@name)

Parsed annotation for CisBP motifs

Load CisBP Nematostella motifs

# CisBP motifs + additionally transferred based on %ID
pwms_files_cisbp <- file.path("annotation","CisBP_ident_transfered_motifs.RDS")
pwms_cisbp <- readRDS(pwms_files_cisbp) # 2382
data_cisbp <- fread(str_replace(pwms_files_cisbp,"RDS","tsv"))
gen_cisbp <- data_cisbp$Gene_ID
nms_cisbp <- data_cisbp$name
pwms_cisbp_nm <- lapply(seq_along(pwms_cisbp), function(i) {
  motif <- pwms_cisbp[[i]]
  if (!is.na(gen_cisbp[i])) motif@altname <- gen_cisbp[i]
  if (!is.na(nms_cisbp[i])) motif@extrainfo <- nms_cisbp[i]
  return(motif)
})
names(pwms_cisbp_nm) <- names(pwms_cisbp)

Save all motifs

pwms <- c(pwms_homer,pwms_cisbp)
saveRDS(pwms, file.path(res_dir,"motifs-all.rds"))

Parameters for similarity and archetyping

# motif similarity
similarity = "PPM" 
method = "PCC"
normalise.scores = TRUE
if (normalise.scores == TRUE) {
  normalization  <- "norm"
} else {
  normalization  <- ""
}
# clustering for archetyping
min_cluster_similarity <- 0.8
hclust_method <- "complete"
dist_method <- "euclidean"
# archetyping threshold
IC_threshold <- 0.5
len_threshold <- 8

Calculate pairwise similarity

# motifs
pwms <- readRDS(file.path(res_dir,"motifs-all.rds"))

# similarity
sim_mat <- compare_motifs(
  motifs = pwms, 
  use.type = similarity, 
  method = method, 
  normalise.scores = normalise.scores, 
  min.position.ic = 0,
  min.mean.ic = 0
)
rownames(sim_mat) <- colnames(sim_mat) <- names(pwms)
saveRDS(sim_mat,file.path(res_dir,sprintf("motifs-similarity-%s-%s%s.rds",similarity,method,normalization)))

Cluster and order similarity matrix, then generate archetype motifs.

Choose the minimum cluster similarity appropriately so that clusters of motifs to archetype contain only similar motifs (e.g. when using a higher value of 0.8, many cluster contain outlier motifs, with lower values these get split).

# motifs and similarity
pwms <- readRDS(file.path(res_dir,"motifs-all.rds"))
sim_mat <- readRDS(file.path(res_dir,sprintf("motifs-similarity-%s-%s%s.rds",similarity,method,normalization)))

# ordering
reclust_motifs <- TRUE
ord <- rownames(sim_mat)
if (reclust_motifs) {
  hc <- hclust(dist(sim_mat, method = dist_method), method = hclust_method)
  ord <- hc$labels[hc$order]
}
cuts <- seq(200,300,10)
cuts_scores <- sapply(cuts, function(h) {
  ctr <- cutree(hc, k=h)
  cl_scores <- sapply(unique(ctr),function(x) {
    ms <- names(ctr[ctr==x])
    within_cl <- median(sim_mat[ms,ms], na.rm=TRUE)
    between_cl <- median(unlist(sim_mat[!(rownames(sim_mat)%in%ms),ms]), unlist(sim_mat[ms,!(colnames(sim_mat)%in%ms)]), na.rm=TRUE)
    if (is.na(between_cl)) between_cl <- 1
    within_cl/between_cl
  })
  mean(cl_scores, na.rm=TRUE)
})
k <- cuts[which.max(cuts_scores)]
plot(cuts,cuts_scores)
abline(v=k)
ctr <- cutree(hc, k=k)

# archetyping
arch <- mta_merge_archetype(
  sim_mat = sim_mat, 
  motifs = pwms,
  clusters = ctr,
  min_cluster_similarity = min_cluster_similarity,
  recluster = FALSE, 
  block_filter = TRUE,
  bkg = rep(0.25,4), 
  pseudocount = 0.0001, 
  IC_threshold = IC_threshold, 
  len_threshold = len_threshold,
  occupancy_threshold = 1,
  verbose = TRUE
)
# list of 1118 archetypes including 3276 motifs (131 archetype(s) fail filters)
arch <- arch[sapply(arch, function(x) length(x)>0)]
arch_file <- file.path(res_dir,sprintf("motif-archetypes-%s-%s%s-%s-IC%s-%sbp.rds",similarity,method,normalization,min_cluster_similarity,IC_threshold,len_threshold))
saveRDS(arch, arch_file)

Annotate archetype motifs (dictionary).

# archetype motifs
arch_file <- file.path(res_dir,"motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp.rds")
arch <- readRDS(arch_file)

# cisbp direct and inferred motifs for nematostella assigned to genes
CisBP_ann_file <- file.path("annotation","CisBP_ident_transfered_motifs.tsv")
cisbp_ann <- fread(CisBP_ann_file)
setnames(cisbp_ann,c("Gene_ID","Motif_ID"),c("gene","motif"))
cisbp_tfs <- cisbp_ann[,.(gene,motif)][]
TF_motifs_file <- file.path(res_dir,"CisBP_ident_transfered_genes_to_motifs.tsv")
fwrite(cisbp_tfs, TF_motifs_file, sep="\t")

# cisbp family annotation for all direct TFs
CisBP_family_annotation_file <- file.path("annotation","CisBP_2021_08_11_TF_Information.txt")

# tf annotations
TF_annotation_file <- file.path("annotation","curated_TFh_Nvec_DToL_names.tsv")
TF_family_annotation_file <- file.path("annotation","gene_families_searchinfo.csv")

# mapping between CisBP and our TF family annotations
CisBP_TF_family_mapping_file <- file.path("annotation","CisBP_TF_mapping.tsv")

# make dictionary
dict <- mta_archetype_dictionary(
  arch = arch, 
  TF_annotation_file = TF_annotation_file, 
  TF_motifs_file = TF_motifs_file,
  TF_family_annotation_file = TF_family_annotation_file,
  CisBP_family_annotation_file = CisBP_family_annotation_file,
  CisBP_TF_family_mapping_file = CisBP_TF_family_mapping_file
)

# save dictonary
fwrite(dict, str_replace(arch_file, ".rds$",".dict"), sep="\t", quote=FALSE, col.names = TRUE)
arch_file <- file.path(res_dir,"motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp.rds")
dict <- fread(str_replace(arch_file, ".rds$",".dict"))
dict_nmot <- dict[,.(archetype_name,archetype_num_motifs)]
gp_nmot <- ggplot(dict_nmot, aes(archetype_num_motifs)) + 
  geom_bar(color="white") +
  scale_x_continuous(expand = expansion(mult=0.01), breaks = c(1,seq(10,100,10))) +
  scale_y_continuous(expand = expansion(mult=c(0,0.01))) +
  theme(panel.grid.major = element_line(size=0.5)) +
  labs(x="number of motifs per archetype", y="number of archetypes")
gp_nmot

Plot archetyping clusters.

# plot archetyping clusters
archetyping_file <- file.path(
  fig_dir,
  basename(file.path(str_replace(arch_file, "\\.rds$", "-archetyping.pdf")))
)
mta_plot_archetype(arch = arch, dict = dict, output_file = archetyping_file)

# plot archetype logos
pdf(file.path(fig_dir, basename(str_replace(arch_file, "\\.rds$", "-archetypes.pdf"))), width=8, height=3)
for (x in seq_along(arch)) {
  message(x)
  motif <- arch[[x]]$ppm_consensus
  motif@name <- dict[archetype_num==paste0("ARCH",x)]$archetype_name[1]
  motif@alphabet <- "DNA"
  tryCatch({
    print(view_motifs(
      motifs = motif,
      relative_entropy = TRUE,
      normalise.scores = FALSE
    ) + labs(title = x))
  }, error=function(e) message(sprintf("Failed to plot ARCH%s\n%s",x,e)))
}
dev.off()

Save archetype motifs.

# load archetyping results
arch <- readRDS(arch_file)
arch_list <- lapply(arch, function(x) x$ppm_consensus)

# add archetype names to motifs
dict <- fread(file.path(
  res_dir,
  "motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp.dict"
))
arch_nms <- dict[match(
  sapply(arch_list, function(x) x@name),
  archetype
)]$archetype_name
names(arch_list) <- arch_nms
arch_list_nm <- lapply(seq_along(arch_list), function(i) {
  x <- arch_list[[i]]
  x@name <- arch_nms[i]
  x
})
names(arch_list_nm) <- arch_nms
saveRDS(arch_list_nm, str_replace(arch_file, ".rds$","-pwms.rds"))

universalmotif::write_homer(
  motifs = arch_list_nm, 
  file = str_replace(arch_file, ".rds$","-pwms.homer"), 
  overwrite = TRUE
)

arch_list_nm <- lapply(arch_list_nm, function(x) {
  x@alphabet <- "DNA"
  x
}) 
universalmotif::write_meme(
  motifs = arch_list_nm, 
  file = str_replace(arch_file, ".rds$","-pwms.meme"), 
  overwrite = TRUE
)

Plot heatmap of similarity of all motifs used in archetyping (i.e. motifs after filtering), with archetyping clusters indicated.

require(ComplexHeatmap)

pwms <- readRDS(file.path(res_dir,"motifs-all.rds"))
sim_mat_file <- file.path(res_dir,"motifs-similarity-PPM-PCCnorm.rds")
sim_mat <- readRDS(sim_mat_file)
arch_file <- file.path(res_dir,"motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp.rds")
arch <- readRDS(arch_file)
dict <- fread(str_replace(arch_file, ".rds$",".dict"))
names(arch) <- unique(dict$archetype_name)
heatmap_file <- file.path(
  fig_dir,
  str_replace_all(basename(sim_mat_file), c("similarity"="similarity-archetypes", ".rds$"=".pdf"))
)
mta_plot_archetype_heatmap(
  sim_mat = sim_mat,
  arch = arch,
  dict = dict,
  output_file = heatmap_file,
  height = 14,
  width = 16
)

Mapping archetpes and TFs

We first identify expressed genes and TFs in groups that were defined by clustering.

# TF annotation
tfs_annt <- fread(
  file.path("annotation", "curated_TFh_Nvec_DToL_names.tsv"),
  header = FALSE
)
setnames(tfs_annt, c("gene", "pfam", "og"))

# groupped genes
mark_dt <- fread(file.path("RNASEQ_QUANTIFICATION", "results", "genes_clusters.tsv"))
mark_dt[, group := paste0("G", group)]
gene_dt <- fread(file.path("RNASEQ_QUANTIFICATION", "results", "genes_clusters_predicted.tsv"))
gene_dt[, group := group_xgb]
group_dt <- rbindlist(list(
  clustered = mark_dt[, .(gene, group)],
  predicted = gene_dt[, .(gene, group)]
),  idcol = "method")
group_dt[, group := factor(
  group,
  levels = paste0("G", unique(
    sort(as.integer(str_remove(group_dt$group, "G")))
  ))
)]
message(sprintf(
  "Total genes: %i", length(group_dt$gene)
))
## Total genes: 20535
# expressed genes
con_mt <- read.table(
  file.path("RNASEQ_QUANTIFICATION", "raw_counts_rnaseq.tsv"),
  header=TRUE, row.names = 1
)
expr_genes <- names(which(apply(con_mt, 1, function(x) sum(x) > 10) == TRUE))
message(sprintf(
  "Expressed genes: %i", length(expr_genes)
))
## Expressed genes: 20080
# expressed TFs
expr_tfs <- expr_genes[expr_genes %in% tfs_annt$gene]
message(sprintf(
  "Expressed TFs: %i", length(expr_tfs)
))
## Expressed TFs: 611
expr_dt <- group_dt[gene %in% expr_genes]
expr_dt[, TF := gene %in% expr_tfs]
expr_dt[, TF := factor(TF, levels = c(TRUE, FALSE))]
setorder(expr_dt, group, TF)

How many TFs (do not) have archetypes, and how many archetypes (do not) have a TF?

# archetype dictionary
dict_fnm <- file.path(
  res_dir,
  "motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp.dict"
)
dict <- fread(dict_fnm)
message(sprintf(
  "Fraction of TFs with motifs: %s", 
  mean(expr_tfs %in% dict$gene)
))

# TF annotation
tf_exp <- unique(tfs_annt[gene %in% expr_tfs][, .SD[1], gene])

# TFs without an archetype
tfs_dt <- unique(dict[gene != ""][
  , .(archetype_name, gene)])[
    , .(number_of_motifs = .N), gene][
      order(number_of_motifs)]
orp_tfs <- unique(tf_exp[!gene %in% dict$gene]$gene)
gp_tfs <- ggplot(tfs_dt, aes(number_of_motifs)) + 
  geom_bar(color = "white") + 
  scale_x_continuous(expand = expansion(mult = 0.01), breaks = c(1, seq(10, 100, 10))) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.01))) +
  theme(panel.grid.major = element_line(size = 0.5)) +
  labs(
    x = "number of archetype motifs per TF",
    y="number of TFs",
    caption = sprintf(
      "%s TFs with motif(s); %s TFs without a motif",
      nrow(tfs_dt), length(orp_tfs)
    )
  )

# archetypes without a TF
arc_dt <- unique(dict[gene != ""][
  , .(archetype_name, gene)])[
    , .(number_of_genes = .N), archetype_name][
      order(number_of_genes)]
orp_arc <- unique(dict[!archetype_name %in% arc_dt$archetype_name]$archetype_name)
gp_arc <- ggplot(arc_dt, aes(number_of_genes)) + 
  geom_bar(color="white") + 
  scale_x_continuous(expand = expansion(mult=0.01)) +
  scale_y_continuous(expand = expansion(mult=c(0,0.01))) +
  theme(panel.grid.major = element_line(size=0.5)) +
  labs(
    x = "number of TFs per archetype motif",
    y="number of archetype motifs",
    caption = sprintf(
      "%s motifs with gene(s); %s motifs without a gene",
      nrow(arc_dt), length(orp_arc)
    )
  )

gp_tfs / gp_arc

How many TFs expressed in each group have motifs?

tfs_dt <- merge.data.table(
  expr_dt[TF == TRUE][, TF := NULL],
  unique(dict[, .(gene, og, pfam, archetype, archetype_name)]),
  by = "gene", all.x = TRUE, sort = FALSE
)

dt_tf_1 <- unique(
  tfs_dt[, .(gene, archetype, group)][
    order(archetype)][
      , .SD[1], gene]
)
gp_tf_1 <- ggplot(
  dt_tf_1,
  aes(group, fill = !is.na(archetype))) +
  geom_bar() +
  scale_fill_manual(
    "",
    values = c("TRUE" = "#0ab30a", "FALSE" = "#b41b1b"),
    labels = c(
      "TRUE" = sprintf(
        "motif (%i)",
        length(unique(dt_tf_1[!is.na(archetype)]$gene))
      ),
      "FALSE" = sprintf(
        "no motif (%i)",
        length(unique(dt_tf_1[is.na(archetype)]$gene))
      )
    )
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.01))) +
  labs(y = "all TFs", x = "")

dt_tf_2 <- unique(
  tfs_dt[method == "clustered"][
    , .(gene, archetype, group)][
      order(archetype)][
        , .SD[1], gene]
)
gp_tf_2 <- ggplot(
  dt_tf_2,
  aes(group, fill = !is.na(archetype))) +
  geom_bar() +
  scale_fill_manual(
    "",
    values = c("TRUE" = "#0ab30a", "FALSE" = "#b41b1b"),
    labels = c(
      "TRUE" = sprintf(
        "motif (%i)",
        length(unique(dt_tf_2[!is.na(archetype)]$gene))
      ),
      "FALSE" = sprintf(
        "no motif (%i)",
        length(unique(dt_tf_2[is.na(archetype)]$gene))
      )
    )
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.01))) +
  labs(y = "marker TFs", x = "")

gp_ft <- (gp_tf_1 / gp_tf_2 & theme(
  legend.position = "top",
  panel.grid.major.y = element_line(size = 0.5)
  ))
gp_ft

To match missing TFs and motifes, we classified them in TF families. We will focus on TFs used for clustering here.

# TF families
TF_family_annotation_file <- file.path("annotation","gene_families_searchinfo.csv")
tf_fams <- fread(TF_family_annotation_file)

# clustering markers
mark_dt <- fread(file.path(
  "RNASEQ_QUANTIFICATION", "results", "genes_clusters.tsv"
))

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# missing TFs
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
orp_tfs_dt <- tfs_annt[gene %in% mark_dt$gene]
orp_tfs_dt <- unique(orp_tfs_dt[gene %in% orp_tfs][, .SD[1], gene])
# get family info from gene og
orp_tfs_dt[og!="", tf_family := str_remove(og, "\\.(?<=\\.).+")]  
orp_tfs_dt[, tf_family := str_replace_all(tf_family, "AP2", "AP-2")]
# summarize
orp_tfs_dtn <- orp_tfs_dt[, .N,tf_family][
  , prop := N / sum(N)][
    order(-N)]
orp_tfs_dtn_plot <- copy(orp_tfs_dtn)
orp_tfs_dtn_plot <- unique(orp_tfs_dtn_plot)
orp_tfs_dtn_plot[, tf_family := factor(
  tf_family, 
  levels = orp_tfs_dtn_plot$tf_family
)]

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# missing motifs
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
orp_arc_dt <- unique(dict[archetype_name %in% orp_arc])
# get family info from archetype name
orp_arc_dt[tf_family == "", tf_family := str_extract(
  archetype_name, paste(tf_fams[[1]], collapse = "|")
)]
# get family info from motifs names
orp_arc_na <- orp_arc_dt[is.na(tf_family), ][
  , tf_family := str_extract(motif,paste(tf_fams[[1]],collapse="|"))][
    , .(archetype,tf_family)][
      !is.na(tf_family)]
orp_arc_vc <- structure(orp_arc_na$tf_family, names = orp_arc_na$archetype)
orp_arc_dt[archetype %in% names(orp_arc_vc), tf_family := orp_arc_vc[archetype]]
# guess family info from archetype names
grep_list2 <- list(
  "fox" = "Forkhead",
  "hox" = "Homeodomains",
  "sox" = "HMGbox_Sox",
  "runx|runt" = "Runt_Runx",
  "mads|srf" = "MADS-box_SRF",
  "Myb_DNA-bind" = "Myb"
)
nms <- unlist(strsplit(tf_fams[[2]], ","))
fms <- unname(unlist(lapply(1:nrow(tf_fams), function(i) 
  rep(tf_fams[i,1], length(strsplit(as.character(tf_fams[i,2]), ",")[[1]]))
)))
grep_list1 <- fms; names(grep_list1) <- nms
grep_list <- c(grep_list1, grep_list2)
for (gp in names(grep_list)) {
  orp_arc_dt[is.na(tf_family) & grepl(gp, archetype, ignore.case = TRUE),
  tf_family := grep_list[[gp]]]
}
orp_arc_dt[, tf_family := str_replace_all(tf_family, "AP2", "AP-2")]
orp_arc_dt[is.na(tf_family), tf_family := "unknown"]
orp_arc_dtn <- unique(orp_arc_dt[, .(archetype, tf_family)])
# summarize with unknown
orp_arc_dtn_1 <- orp_arc_dt[
  , .N, tf_family][
    , prop := N / sum(N)][order(-N)]
orp_arc_dtn_1[, tf_family := factor(tf_family, levels = orp_arc_dtn_1$tf_family)]
# summarize without unknown
orp_arc_dtn_2 <- orp_arc_dt[tf_family != "unknown"][
  , .N, tf_family][
    , prop := N / sum(N)][order(-N)]
orp_arc_dtn_2[, tf_family := factor(tf_family, levels = orp_arc_dtn_2$tf_family)]

# colors
tf_fams <- unique(sort(c(tf_fams[[1]], unique(orp_tfs_dt$tf_family))))
tf_fams <- tf_fams[tf_fams %in% c(as.character(orp_tfs_dtn$tf_family), as.character(orp_arc_dt$tf_family))]
tf_fams <- str_replace_all(tf_fams, "AP2", "AP-2")
tf_fams_cols <- structure(
  c(colorRampPalette(RColorBrewer::brewer.pal(8,'Paired'))(length(tf_fams)), "khaki", "grey"),
  names = c(tf_fams, "other", "unknown")
)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# plots
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
gp_orp_tfs <- ggplot(orp_tfs_dtn_plot, aes("", N, fill = tf_family)) +
  geom_bar(stat = "identity", width = 1, color = "black") +
  coord_polar("y", start=0) +
  scale_fill_manual(values = tf_fams_cols, limits = force) +
  geom_text(
    aes(label = ifelse(
      N < 1,
      "",
      sprintf("%s (%s)", tf_family, N)
    ), x = 1.4),
    position = position_stack(vjust = 0.5),
    color = "black"
  ) +
  labs(title = "orphan TFs") +
  theme_void() +
  theme(legend.position = "none")

gp_orp_arc_1 <- ggplot(orp_arc_dtn_1, aes("", N, fill = tf_family)) +
  geom_bar(stat="identity", width = 1, color = "black") +
  coord_polar("y", start = 0) +
  scale_fill_manual(values = tf_fams_cols, limits = force) +
  geom_text(
    aes(label = ifelse(
      N < 10,
      "",
      sprintf("%s (%s)", tf_family, N)
    ), x = 1.4),
    position = position_stack(vjust = 0.5),
    color = "black"
  ) +
  labs(title = "orphan motif archetypes") +
  theme_void() +
  theme(legend.position = "none")

gp_orp_arc_2 <- ggplot(orp_arc_dtn_2, aes("", N, fill = tf_family)) +
  geom_bar(stat="identity", width = 1, color = "black") +
  coord_polar("y", start = 0) +
  scale_fill_manual(values = tf_fams_cols, limits = force) +
  geom_text(
    aes(label = ifelse(
      N < 1,
      "",
      sprintf("%s (%s)", tf_family, N)
    ), x = 1.4),
    position = position_stack(vjust = 0.5),
    color = "black"
  ) +
  theme_void() +
  labs(title = "(without unknown)") +
  theme(legend.position = "none")

# save
unmatched_lst <- list(
  tfs = orp_tfs_dt,
  motifs = orp_arc_dt
)
saveRDS(unmatched_lst, file.path(
  res_dir, "unmatched-tfs-motifs.rds"
))

# plot
gp_orp_tfs + gp_orp_arc_1 + gp_orp_arc_2

orp_dtn <- rbindlist(list(
  "TFs" = orp_tfs_dtn,
  "archetypes" = orp_arc_dtn_1
), idcol = "variable")
orp_dtn <- orp_dtn[tf_family != "unknown"]

gp_orp <- ggplot(orp_dtn, aes(tf_family)) + 
  geom_bar(
    data = subset(orp_dtn, variable == "TFs"), 
    aes(y = N, fill = tf_family), 
    stat = "identity",
    position = "dodge") +
  geom_bar(
    data = subset(orp_dtn, variable == "archetypes"), 
    aes(y = -N, fill = tf_family), 
    stat = "identity",
    position = "dodge"
  ) + 
  coord_flip() +
  scale_fill_manual(values = tf_fams_cols, limits = force) +
  scale_y_continuous(limits = c(NA, NA), labels = abs) +
  geom_hline(yintercept = 0, colour = "black", size = 0.25) +
  theme(
    legend.position = "none", 
    axis.line = element_blank(),
    panel.grid.major.x = element_line(size=0.5)
  ) +
  labs(y = "TFs \t motifs", x = "TF family")
gp_orp

Motif scores in peaks

Score archetype motifs in consensus peaks

Motif enrichment in groups defined before

Motif enrichment heatmap

# enrichment scores
dt <- fread(
  file.path(res_dir, "motif-enrich-archetypes-mona.tsv")
)
setnames(dt, "label", "group")

# qvalue
dat_qv <- dcast.data.table(
  unique(dt[, .(motif, group, padj)]), motif ~ group, value.var = "padj"
)
mat_qv <- data.matrix(dat_qv)[,-1]
mat_qv[is.na(mat_qv)] <- 1
rownames(mat_qv) <- dat_qv$motif
write.table(
  mat_qv,
  file.path(res_dir, "motif-enrich-archetypes-mona-qvalue.tsv"),
  sep = "\t", quote = FALSE
)

# log2fc
dat_fc <- dcast.data.table(
  unique(dt[,.(motif, group, fc)]), motif ~ group, value.var = "fc"
)
mat_fc <- data.matrix(dat_fc)[,-1]
mat_fc[is.na(mat_fc)] <- 0
rownames(mat_fc) <- dat_fc$motif
write.table(
  mat_fc,
  file.path(res_dir, "motif-enrich-archetypes-mona-fc.tsv"),
  sep = "\t", quote = FALSE
)

# convert to numeric (replacing , with .) and multiply with -1 (so that highly significant = higher value)
dt[, minuslog10qval := -1*log10(padj)]

# filtering
ids <- apply(mat_fc, 1, function(x) max(x) > 1) &
  apply(mat_qv, 1, function(x) !is.infinite(min(x)) & !min(x) > 0.001)

# clustering
hc <- hclust(dist(mat_fc[ids, ]), method = "ward.D2")
ds <- dt[motif %in% rownames(mat_fc)[ids]]
ds[, motif := factor(motif, levels = rev(rownames(mat_fc)[ids]))]

# ordering
mord <- order(apply(mat_fc[ids,], 1, which.max))
ds[, motif := factor(motif, levels=rev(rownames(mat_fc[ids,])[mord]))]

# limit -log10FDR range 
ds[, minuslog10qval := pmin(minuslog10qval, 20)]

# add archetype gene annotations
dict <- fread(file.path(
  res_dir,
  "motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp.dict"
))
dc <- unique(dict[,.(archetype_name, gene)])
setnames(dc, "archetype_name", "motif")
mt_dt <- merge.data.table(
  ds, dc, 
  by = "motif",
  all.x = TRUE
)

# add tf annotations
marks_gold <- fread(
  file.path("annotation", "golden-marks-230116.tsv"),
  fill = TRUE
)[, 1:2]
setnames(marks_gold, c("gene", "name"))
marks_gold[, name := str_remove(name, "^Nv")]

tfs_annt <- fread(
  file.path("annotation", "curated_TFh_Nvec_DToL_names.tsv"),
  header = FALSE
)
setnames(tfs_annt, c("gene", "pfam", "og"))

marks_tfs <- merge.data.table(
  tfs_annt,
  marks_gold,
  by = "gene", all = TRUE, sort = FALSE
)
marks_tfs[is.na(marks_tfs)] <- ""

mt_marks_dt <- merge.data.table(
  mt_dt, marks_tfs,
  by = "gene",
  all.x = TRUE
)
mt_marks_dt[is.na(name), name := ""]
mt_marks_dt[name == "" & pfam != "", name := pfam]
mt_marks_dt[name == "", name := motif]
mt_marks_dt[, motif := factor(motif, levels = levels(ds$motif))]

# add labels
mt_marks_dt[, label := str_extract(name, "(?<=BestGuess_).+")]
mt_marks_dt[is.na(label), label := str_extract(name, "(?<=ARCH\\d{1,3}_).+")]
mt_marks_dt[is.na(label), label := name]
mt_marks_dt[!(pval < 1e5 & fc > 4), label := NA]

# filter labels by RNA plot labels
labels_dt <- marks_tfs[name != ""][, .(name, gene)]
labels_v <- structure(labels_dt$name, names = labels_dt$gene)
mt_marks_dt[is.na(label), label := labels_v[gene]]
mt_marks_dt[!(pval < 1e5 & fc > 3), label := NA]

# plot
labels_p <- function(
  data,
  column_name,
  padj_thrs = 1e-3,
  fc_thrs = 4,
  operation = c("|","&")[1]
) {
  return(
    function(m) {
      dl <- copy(data)
      dl[, column_name := dl[[column_name]]]
      dl <- dl[order(padj)][, .SD[1], .(column_name)]
      dl <- dl[match(m, column_name)]
      if (any(c("|", "OR", "or", "union") %in% operation[1])) {
        m[dl$padj < padj_thrs | dl$fc > fc_thrs]
      } else if (any(c("&", "AND", "and", "intersect") %in% operation[1])) {
        m[dl$padj < padj_thrs & dl$fc > fc_thrs]
      }
    }
  )
}

gp <- ggplot(mt_marks_dt, aes(group, motif, label = label)) +
  geom_point(
    aes(size = minuslog10qval, fill = fc),
    shape = 21,
    color = "black"
  ) + 
  scale_fill_gradientn(
    name = "enrichmnet\nfold change",
    colours = c("white","#fee5d9","#fcae91","#fb6a4a","#de2d26","#a50f15", "#7a0105"),
    breaks = c(0, 2, 4, 6)
  ) +
  scale_size_continuous(
    name = "-log10 FDR", 
    breaks = c(0, 10, 20)
  ) +
  #scale_y_discrete(
  #  breaks = labels_p(
  #    data = mt_marks_dt,
  #    column_name = "motif",
  #    padj_thrs = 1e-3,
  #    fc_thrs = 2,
  #    operation = "&"
  #  )
  #) +
  geom_text_repel(
    force = 0.5,
    nudge_x = -0.25,
    direction = "y",
    hjust = 1,
    segment.size = 0.2
  ) +
  theme(
    panel.grid.major = element_line(size = 0.25),
    axis.text.y = element_text(size = 8)
  )
gp
## Warning: Removed 1459 rows containing missing values (geom_text_repel).

Mapping archetypes and TFs based on score and gene expression in groups

Match by TF expression and motif enrichment in groups.

unmatched_lst <- readRDS(file.path(
  res_dir, "unmatched-tfs-motifs.rds"
))

tfs_df <- unmatched_lst$tfs
mot_df <- unmatched_lst$motifs

tfs_fm <- unique(tfs_df$tf_family)
mot_fm <- unique(mot_df$tf_family)

# TFs group
tfs_grp <- fread(
  file.path("RNASEQ_QUANTIFICATION", "results", "genes_clusters.tsv")
)[, .(gene, group)]
tfs_grp <- tfs_grp[gene %in% tfs_df$gene]
tfs_grp[, group := paste0("G", group)]
tfs_vct <- structure(tfs_grp$group, names = tfs_grp$gene)
tfs_df[, group := tfs_vct[gene]]

# motif group
mot_mat <- read.table(
  file.path(res_dir, "motif-enrich-archetypes-mona-fc.tsv")
)
mot_vct <- apply(mot_mat, 1, function(x) colnames(mot_mat)[which.max(x)])
mot_enr <- apply(mot_mat, 1, max)
mot_df[, group := mot_vct[archetype_name]]
mot_df[, fc := mot_enr[archetype_name]]

# matching
tfs_mot_map <- function(tfs_df, mot_df) {
  tfs <- tfs_df$gene
  mts <- sapply(tfs, function(g) {
    tfs_dg <- tfs_df[gene == g]
    mot_dg_fam <- mot_df[tf_family == tfs_dg$tf_family]
    mot_dg_grp <- mot_dg_fam[group == tfs_dg$group]
    if (!nrow(mot_dg_grp) > 0) {
      NA
    } else {
      mot_dg_top <- mot_dg_grp[order(-fc)][1]
      mot_dg_top$archetype_name
    }
  }, USE.NAMES = TRUE)
  message(sprintf(
    "%i TFs were mapped to motifs;",
    length(mts[!is.na(mts)])
  ))
  message(sprintf(
    "%i TFs could not be mapped.",
    length(mts[is.na(mts)])
  ))
  mts
}
tf_mot_mappings <- tfs_mot_map(tfs_df, mot_df)
## 30 TFs were mapped to motifs;
## 42 TFs could not be mapped.

Update dictionary

dict_fnm <- file.path(
  res_dir,
  "motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp.dict"
)
dict <- fread(dict_fnm)

# TF annotation
tfs_annt <- fread(
  file.path("annotation", "curated_TFh_Nvec_DToL_names.tsv"),
  header = FALSE
)
setnames(tfs_annt, c("gene", "pfam", "og"))

tf_mot_mappings <- tf_mot_mappings[!is.na(tf_mot_mappings)]
tf_mot_annots <- tfs_annt[match(names(tf_mot_mappings), gene)]
tf_mot_og <- structure(tf_mot_annots$og, names = names(tf_mot_mappings))
tf_mot_pfam <- structure(tf_mot_annots$pfam, names = names(tf_mot_mappings))
tf_mot_dt <- data.table(
  "archetype_name" = tf_mot_mappings,
  "gene" = names(tf_mot_mappings)
)
dict_new <- merge.data.table(
  dict, tf_mot_dt, by = "archetype_name",
  all = TRUE, sort = FALSE
)
dict_new[!is.na(gene.y), gene.x := gene.y]
dict_new[, gene.y := NULL]
setnames(dict_new, "gene.x", "gene")
dict_new[gene %in% tf_mot_dt$gene, og := tf_mot_og[gene]]
dict_new[gene %in% tf_mot_dt$gene, pfam := tf_mot_pfam[gene]]
fwrite(
  dict_new,
  file.path(res_dir, "motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp-mapped.dict"),
  sep = "\t"
)

We again identify expressed genes and TFs in groups that were defined by clustering.

# TF annotation
tfs_annt <- fread(
  file.path("annotation", "curated_TFh_Nvec_DToL_names.tsv"),
  header = FALSE
)
setnames(tfs_annt, c("gene", "pfam", "og"))

# groupped genes
mark_dt <- fread(file.path("RNASEQ_QUANTIFICATION", "results", "genes_clusters.tsv"))
mark_dt[, group := paste0("G", group)]
gene_dt <- fread(file.path("RNASEQ_QUANTIFICATION", "results", "genes_clusters_predicted.tsv"))
gene_dt[, group := group_xgb]
group_dt <- rbindlist(list(
  clustered = mark_dt[, .(gene, group)],
  predicted = gene_dt[, .(gene, group)]
),  idcol = "method")
group_dt[, group := factor(
  group,
  levels = paste0("G", unique(
    sort(as.integer(str_remove(group_dt$group, "G")))
  ))
)]
expr_dt <- group_dt[gene %in% expr_genes]
expr_dt[, TF := gene %in% expr_tfs]
expr_dt[, TF := factor(TF, levels = c(TRUE, FALSE))]
setorder(expr_dt, group, TF)

How many TFs (do not) have archetypes, and how many archetypes (do not) have a TF?

# archetype dictionary
dict_fnm <- file.path(
  res_dir,
  "motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp-mapped.dict"
)
dict <- fread(dict_fnm)
message(sprintf(
  "Fraction of TFs with motifs: %s", 
  mean(expr_tfs %in% dict$gene)
))

# TF annotation
tf_exp <- unique(tfs_annt[gene %in% expr_tfs][, .SD[1], gene])

# TFs without an archetype
tfs_dt <- unique(dict[gene != ""][
  , .(archetype_name, gene)])[
    , .(number_of_motifs = .N), gene][
      order(number_of_motifs)]
orp_tfs <- unique(tf_exp[!gene %in% dict$gene]$gene)
gp_tfs <- ggplot(tfs_dt, aes(number_of_motifs)) + 
  geom_bar(color = "white") + 
  scale_x_continuous(expand = expansion(mult = 0.01), breaks = c(1, seq(10, 100, 10))) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.01))) +
  theme(panel.grid.major = element_line(size = 0.5)) +
  labs(
    x = "number of archetype motifs per TF",
    y="number of TFs",
    caption = sprintf(
      "%s TFs with motif(s); %s TFs without a motif",
      nrow(tfs_dt), length(orp_tfs)
    )
  )

# archetypes without a TF
arc_dt <- unique(dict[gene != ""][
  , .(archetype_name, gene)])[
    , .(number_of_genes = .N), archetype_name][
      order(number_of_genes)]
orp_arc <- unique(dict[!archetype_name %in% arc_dt$archetype_name]$archetype_name)
gp_arc <- ggplot(arc_dt, aes(number_of_genes)) + 
  geom_bar(color="white") + 
  scale_x_continuous(expand = expansion(mult=0.01)) +
  scale_y_continuous(expand = expansion(mult=c(0,0.01))) +
  theme(panel.grid.major = element_line(size=0.5)) +
  labs(
    x = "number of TFs per archetype motif",
    y="number of archetype motifs",
    caption = sprintf(
      "%s motifs with gene(s); %s motifs without a gene",
      nrow(arc_dt), length(orp_arc)
    )
  )

gp_tfs / gp_arc

How many TFs expressed in each group have motifs?

tfs_dt <- merge.data.table(
  expr_dt[TF == TRUE][, TF := NULL],
  unique(dict[, .(gene, og, pfam, archetype, archetype_name)]),
  by = "gene", all.x = TRUE, sort = FALSE
)

dt_tf_1 <- unique(
  tfs_dt[, .(gene, archetype, group)][
    order(archetype)][
      , .SD[1], gene]
)
gp_tf_1 <- ggplot(
  dt_tf_1,
  aes(group, fill = !is.na(archetype))) +
  geom_bar() +
  scale_fill_manual(
    "",
    values = c("TRUE" = "#0ab30a", "FALSE" = "#b41b1b"),
    labels = c(
      "TRUE" = sprintf(
        "motif (%i)",
        length(unique(dt_tf_1[!is.na(archetype)]$gene))
      ),
      "FALSE" = sprintf(
        "no motif (%i)",
        length(unique(dt_tf_1[is.na(archetype)]$gene))
      )
    )
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.01))) +
  labs(y = "all TFs", x = "")

dt_tf_2 <- unique(
  tfs_dt[method == "clustered"][
    , .(gene, archetype, group)][
      order(archetype)][
        , .SD[1], gene]
)
gp_tf_2 <- ggplot(
  dt_tf_2,
  aes(group, fill = !is.na(archetype))) +
  geom_bar() +
  scale_fill_manual(
    "",
    values = c("TRUE" = "#0ab30a", "FALSE" = "#b41b1b"),
    labels = c(
      "TRUE" = sprintf(
        "motif (%i)",
        length(unique(dt_tf_2[!is.na(archetype)]$gene))
      ),
      "FALSE" = sprintf(
        "no motif (%i)",
        length(unique(dt_tf_2[is.na(archetype)]$gene))
      )
    )
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.01))) +
  labs(y = "marker TFs", x = "")

gp_ft <- (gp_tf_1 / gp_tf_2 & theme(
  legend.position = "top",
  panel.grid.major.y = element_line(size = 0.5)
  ))
gp_ft

To match missing TFs and motifes, we classified them in TF families. We will focus on TFs used for clustering here.

# TF families
TF_family_annotation_file <- file.path("annotation","gene_families_searchinfo.csv")
tf_fams <- fread(TF_family_annotation_file)

# clustering markers
mark_dt <- fread(file.path(
  "RNASEQ_QUANTIFICATION", "results", "genes_clusters.tsv"
))

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# missing TFs
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
orp_tfs_dt <- tfs_annt[gene %in% mark_dt$gene]
orp_tfs_dt <- unique(orp_tfs_dt[gene %in% orp_tfs][, .SD[1], gene])
# get family info from gene og
orp_tfs_dt[og!="", tf_family := str_remove(og, "\\.(?<=\\.).+")]  
orp_tfs_dt[, tf_family := str_replace_all(tf_family, "AP2", "AP-2")]
# summarize
orp_tfs_dtn <- orp_tfs_dt[, .N,tf_family][
  , prop := N / sum(N)][
    order(-N)]
orp_tfs_dtn_plot <- copy(orp_tfs_dtn)
orp_tfs_dtn_plot <- unique(orp_tfs_dtn_plot)
orp_tfs_dtn_plot[, tf_family := factor(
  tf_family, 
  levels = orp_tfs_dtn_plot$tf_family
)]

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# missing motifs
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
orp_arc_dt <- unique(dict[archetype_name %in% orp_arc])
# get family info from archetype name
orp_arc_dt[tf_family == "", tf_family := str_extract(
  archetype_name, paste(tf_fams[[1]], collapse = "|")
)]
# get family info from motifs names
orp_arc_na <- orp_arc_dt[is.na(tf_family), ][
  , tf_family := str_extract(motif,paste(tf_fams[[1]],collapse="|"))][
    , .(archetype,tf_family)][
      !is.na(tf_family)]
orp_arc_vc <- structure(orp_arc_na$tf_family, names = orp_arc_na$archetype)
orp_arc_dt[archetype %in% names(orp_arc_vc), tf_family := orp_arc_vc[archetype]]
# guess family info from archetype names
grep_list2 <- list(
  "fox" = "Forkhead",
  "hox" = "Homeodomains",
  "sox" = "HMGbox_Sox",
  "runx|runt" = "Runt_Runx",
  "mads|srf" = "MADS-box_SRF",
  "Myb_DNA-bind" = "Myb"
)
nms <- unlist(strsplit(tf_fams[[2]], ","))
fms <- unname(unlist(lapply(1:nrow(tf_fams), function(i) 
  rep(tf_fams[i,1], length(strsplit(as.character(tf_fams[i,2]), ",")[[1]]))
)))
grep_list1 <- fms; names(grep_list1) <- nms
grep_list <- c(grep_list1, grep_list2)
for (gp in names(grep_list)) {
  orp_arc_dt[is.na(tf_family) & grepl(gp, archetype, ignore.case = TRUE),
  tf_family := grep_list[[gp]]]
}
orp_arc_dt[, tf_family := str_replace_all(tf_family, "AP2", "AP-2")]
orp_arc_dt[is.na(tf_family), tf_family := "unknown"]
orp_arc_dtn <- unique(orp_arc_dt[, .(archetype, tf_family)])
# summarize with unknown
orp_arc_dtn_1 <- orp_arc_dt[
  , .N, tf_family][
    , prop := N / sum(N)][order(-N)]
orp_arc_dtn_1[, tf_family := factor(tf_family, levels = orp_arc_dtn_1$tf_family)]
# summarize without unknown
orp_arc_dtn_2 <- orp_arc_dt[tf_family != "unknown"][
  , .N, tf_family][
    , prop := N / sum(N)][order(-N)]
orp_arc_dtn_2[, tf_family := factor(tf_family, levels = orp_arc_dtn_2$tf_family)]

# colors
tf_fams <- unique(sort(c(tf_fams[[1]], unique(orp_tfs_dt$tf_family))))
tf_fams <- tf_fams[tf_fams %in% c(as.character(orp_tfs_dtn$tf_family), as.character(orp_arc_dt$tf_family))]
tf_fams <- str_replace_all(tf_fams, "AP2", "AP-2")
tf_fams_cols <- structure(
  c(colorRampPalette(RColorBrewer::brewer.pal(8,'Paired'))(length(tf_fams)), "khaki", "grey"),
  names = c(tf_fams, "other", "unknown")
)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# plots
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
gp_orp_tfs <- ggplot(orp_tfs_dtn_plot, aes("", N, fill = tf_family)) +
  geom_bar(stat = "identity", width = 1, color = "black") +
  coord_polar("y", start=0) +
  scale_fill_manual(values = tf_fams_cols, limits = force) +
  geom_text(
    aes(label = ifelse(
      N < 1,
      "",
      sprintf("%s (%s)", tf_family, N)
    ), x = 1.4),
    position = position_stack(vjust = 0.5),
    color = "black"
  ) +
  labs(title = "orphan TFs") +
  theme_void() +
  theme(legend.position = "none")

gp_orp_arc_1 <- ggplot(orp_arc_dtn_1, aes("", N, fill = tf_family)) +
  geom_bar(stat="identity", width = 1, color = "black") +
  coord_polar("y", start = 0) +
  scale_fill_manual(values = tf_fams_cols, limits = force) +
  geom_text(
    aes(label = ifelse(
      N < 10,
      "",
      sprintf("%s (%s)", tf_family, N)
    ), x = 1.4),
    position = position_stack(vjust = 0.5),
    color = "black"
  ) +
  labs(title = "orphan motif archetypes") +
  theme_void() +
  theme(legend.position = "none")

gp_orp_arc_2 <- ggplot(orp_arc_dtn_2, aes("", N, fill = tf_family)) +
  geom_bar(stat="identity", width = 1, color = "black") +
  coord_polar("y", start = 0) +
  scale_fill_manual(values = tf_fams_cols, limits = force) +
  geom_text(
    aes(label = ifelse(
      N < 1,
      "",
      sprintf("%s (%s)", tf_family, N)
    ), x = 1.4),
    position = position_stack(vjust = 0.5),
    color = "black"
  ) +
  theme_void() +
  labs(title = "(without unknown)") +
  theme(legend.position = "none")

# save
unmatched_lst <- list(
  tfs = orp_tfs_dt,
  motifs = orp_arc_dt
)
saveRDS(unmatched_lst, file.path(
  res_dir, "unmatched-tfs-motifs-2.rds"
))

# plot
gp_orp_tfs + gp_orp_arc_1 + gp_orp_arc_2

orp_dtn <- rbindlist(list(
  "TFs" = orp_tfs_dtn,
  "archetypes" = orp_arc_dtn_1
), idcol = "variable")
orp_dtn <- orp_dtn[tf_family != "unknown"]

gp_orp <- ggplot(orp_dtn, aes(tf_family)) + 
  geom_bar(
    data = subset(orp_dtn, variable == "TFs"), 
    aes(y = N, fill = tf_family), 
    stat = "identity",
    position = "dodge") +
  geom_bar(
    data = subset(orp_dtn, variable == "archetypes"), 
    aes(y = -N, fill = tf_family), 
    stat = "identity",
    position = "dodge"
  ) + 
  coord_flip() +
  scale_fill_manual(values = tf_fams_cols, limits = force) +
  scale_y_continuous(limits = c(NA, NA), labels = abs) +
  geom_hline(yintercept = 0, colour = "black", size = 0.25) +
  theme(
    legend.position = "none", 
    axis.line = element_blank(),
    panel.grid.major.x = element_line(size=0.5)
  ) +
  labs(y = "TFs \t motifs", x = "TF family")
gp_orp

Motif enrichment heatmap

# enrichment scores
dt <- fread(
  file.path(res_dir, "motif-enrich-archetypes-mona.tsv")
)
setnames(dt, "label", "group")

# qvalue
dat_qv <- dcast.data.table(
  unique(dt[, .(motif, group, padj)]), motif ~ group, value.var = "padj"
)
mat_qv <- data.matrix(dat_qv)[,-1]
mat_qv[is.na(mat_qv)] <- 1
rownames(mat_qv) <- dat_qv$motif
write.table(
  mat_qv,
  file.path(res_dir, "motif-enrich-archetypes-mona-qvalue.tsv"),
  sep = "\t", quote = FALSE
)

# log2fc
dat_fc <- dcast.data.table(
  unique(dt[,.(motif, group, fc)]), motif ~ group, value.var = "fc"
)
mat_fc <- data.matrix(dat_fc)[,-1]
mat_fc[is.na(mat_fc)] <- 0
rownames(mat_fc) <- dat_fc$motif
write.table(
  mat_fc,
  file.path(res_dir, "motif-enrich-archetypes-mona-fc.tsv"),
  sep = "\t", quote = FALSE
)

# convert to numeric (replacing , with .) and multiply with -1 (so that highly significant = higher value)
dt[, minuslog10qval := -1*log10(padj)]

# filtering
ids <- apply(mat_fc, 1, function(x) max(x) > 1) &
  apply(mat_qv, 1, function(x) !is.infinite(min(x)) & !min(x) > 0.001)

# clustering
hc <- hclust(dist(mat_fc[ids, ]), method = "ward.D2")
ds <- dt[motif %in% rownames(mat_fc)[ids]]
ds[, motif := factor(motif, levels = rev(rownames(mat_fc)[ids]))]

# ordering
mord <- order(apply(mat_fc[ids,], 1, which.max))
ds[, motif := factor(motif, levels=rev(rownames(mat_fc[ids,])[mord]))]

# limit -log10FDR range 
ds[, minuslog10qval := pmin(minuslog10qval, 20)]

# add archetype gene annotations
dict <- fread(file.path(
  res_dir,
  "motif-archetypes-PPM-PCCnorm-0.8-IC0.5-8bp-mapped.dict"
))
dc <- unique(dict[,.(archetype_name, gene)])
setnames(dc, "archetype_name", "motif")
mt_dt <- merge.data.table(
  ds, dc, 
  by = "motif",
  all.x = TRUE
)

# add tf annotations
marks_gold <- fread(
  file.path("annotation", "golden-marks-230116.tsv"),
  fill = TRUE
)[, 1:2]
setnames(marks_gold, c("gene", "name"))
marks_gold[, name := str_remove(name, "^Nv")]

tfs_annt <- fread(
  file.path("annotation", "curated_TFh_Nvec_DToL_names.tsv"),
  header = FALSE
)
setnames(tfs_annt, c("gene", "pfam", "og"))

marks_tfs <- merge.data.table(
  tfs_annt,
  marks_gold,
  by = "gene", all = TRUE, sort = FALSE
)
marks_tfs[is.na(marks_tfs)] <- ""

mt_marks_dt <- merge.data.table(
  mt_dt, marks_tfs,
  by = "gene",
  all.x = TRUE
)
mt_marks_dt[is.na(name), name := ""]
mt_marks_dt[name == "" & pfam != "", name := pfam]
mt_marks_dt[name == "", name := motif]
mt_marks_dt[, motif := factor(motif, levels = levels(ds$motif))]

# add labels
mt_marks_dt[, label := str_extract(name, "(?<=BestGuess_).+")]
mt_marks_dt[is.na(label), label := str_extract(name, "(?<=ARCH\\d{1,3}_).+")]
mt_marks_dt[is.na(label), label := name]
mt_marks_dt[!(pval < 1e5 & fc > 4), label := NA]

# filter labels by RNA plot labels
labels_dt <- marks_tfs[name != ""][, .(name, gene)]
labels_v <- structure(labels_dt$name, names = labels_dt$gene)
mt_marks_dt[is.na(label), label := labels_v[gene]]
setorder(mt_marks_dt,  -minuslog10qval)
mt_marks_dt[, i:=1:.N, .(gene,motif)]
mt_marks_dt[i > 1, label := NA]
# mt_marks_dt[!(pval < 1e5 & fc > 3), label := NA]

# plot
labels_p <- function(
  data,
  column_name,
  padj_thrs = 1e-3,
  fc_thrs = 4,
  operation = c("|","&")[1]
) {
  return(
    function(m) {
      dl <- copy(data)
      dl[, column_name := dl[[column_name]]]
      dl <- dl[order(padj)][, .SD[1], .(column_name)]
      dl <- dl[match(m, column_name)]
      if (any(c("|", "OR", "or", "union") %in% operation[1])) {
        m[dl$padj < padj_thrs | dl$fc > fc_thrs]
      } else if (any(c("&", "AND", "and", "intersect") %in% operation[1])) {
        m[dl$padj < padj_thrs & dl$fc > fc_thrs]
      }
    }
  )
}

gp <- ggplot(mt_marks_dt, aes(group, motif, label = label)) +
  geom_point(
    aes(size = minuslog10qval, fill = fc),
    shape = 21,
    color = "black"
  ) + 
  scale_fill_gradientn(
    name = "enrichmnet\nfold change",
    colours = c("white","#fee5d9","#fcae91","#fb6a4a","#de2d26","#a50f15", "#7a0105"),
    breaks = c(0, 2, 4, 6)
  ) +
  scale_size_continuous(
    name = "-log10 FDR", 
    breaks = c(0, 10, 20)
  ) +
  #scale_y_discrete(
  #  breaks = labels_p(
  #    data = mt_marks_dt,
  #    column_name = "motif",
  #    padj_thrs = 1e-3,
  #    fc_thrs = 2,
  #    operation = "&"
  #  )
  #) +
  geom_text_repel(
    force = 0.5,
    nudge_x = -0.25,
    direction = "y",
    hjust = 1,
    segment.size = 0.2
  ) +
  theme(
    panel.grid.major = element_line(size = 0.25),
    axis.text.y = element_text(size = 8)
  )
gp
## Warning: Removed 1598 rows containing missing values (geom_text_repel).

Footprinting

Merge replicates per condition

bam_dir="ATACSEQ/nucleosome_free_regions/bam/"
nth=18

for line in Fox Elav Ncol
do
  for cond in pos neg
  do
    name=${line}_${cond}
    echo ${name}
    bams=$( echo ${bam_dir}/${name}*bam )
    samtools merge -@ ${nth} ${bam_dir}/${name}.ncfree.bam ${bams}
    samtools sort -@ ${nth} -o ${bam_dir}/${name}.ncfree.sorted.bam ${bam_dir}/${name}.ncfree.bam
    rm ${bam_dir}/${name}.ncfree.bam
    samtools index -@ ${nth} ${bam_dir}/${name}.ncfree.sorted.bam
  done
done

Footprint score calculation for consensus peaks

conda activate TOBIAS_ENV
bam_dir="ATACSEQ/nucleosome_free_regions/bam/"
out_dir="ATACSEQ/nucleosome_free_regions/footprint/"
mkdir ${out_dir}
peaks="ATACSEQ/nucleosome_free_regions/consensus_peaks/consensusSeekeR-peaks.bed"
genome="genome/Nvec_vc1.1_gDNA.fasta"
nth=12

for line in Fox Elav Ncol
do
  for cond in pos neg
  do
    name=${line}_${cond}
    
    echo $(date) "- Starting ATACorrect for" ${name}
    TOBIAS ATACorrect \
      --bam ${bam_dir}/${name}.ncfree.sorted.bam \
      --genome ${genome} \
      --peaks ${peaks} \
      --prefix ${name} \
      --outdir ${out_dir}/ATACorrect \
      --cores ${nth}
    
    echo $(date) "- Starting FootprintScores"
    TOBIAS FootprintScores \
      --signal ${out_dir}/ATACorrect/${name}_corrected.bw \
      --regions ${peaks} \
      --fp-min 10 --fp-max 50 \
      --output ${out_dir}/${name}_footprints.bw \
      --cores ${nth}
    
  done
done

Plot difference in footprints

conda activate TOBIAS_ENV
bam_dir="ATACSEQ/nucleosome_free_regions/bam/"
bwg_dir="ATACSEQ/nucleosome_free_regions/footprint/ATACorrect"
plt_dir="ATACSEQ/nucleosome_free_regions/footprint/plots"
mkdir ${plt_dir}
peaks="ATACSEQ/nucleosome_free_regions/consensus_peaks/consensusSeekeR-peaks.bed"

for line in Fox Elav Ncol
do
  TOBIAS PlotAggregate --TFBS ${peaks} \
    --signals ${bwg_dir}/${line}_pos_corrected.bw ${bwg_dir}/${line}_neg_corrected.bw \
    --output ${plt_dir}/${line}_footprint_comparison_all_peaks.pdf \
    --flank 125 \
    --share_y both \
    --plot_boundaries \
    --signal-on-x
done

TOBIAS PlotAggregate --TFBS ${peaks} \
  --signals ${bwg_dir}/Elav_pos_corrected.bw ${bwg_dir}/Fox_pos_corrected.bw ${bwg_dir}/Ncol_pos_corrected.bw \
  --output ${plt_dir}/pos_footprint_comparison_all_peaks.pdf \
  --flank 150 \
  --share_y both \
  --plot_boundaries \
  --signal-on-x

Combine motif scores and footprint scores to determine motif binding.

mts_fn <- file.path(res_dir, "motif-scores-archetypes-mona-annot.tsv")
ftp_fn <- list.files(ftp_dir, pattern = "bw$", full.names = TRUE)

bin_det <- mta_binding_detect(
  mts_fn = mts_fn,
  ftp_fn = ftp_fn,
  pks_smpl = 1e6,
  pval = 1e-3,
  seed = 1950 
)

bnd_dt <- bin_det$motifs_df
bnd_dt[, sample := str_remove(sample, "_footprints")]
bnd_gr <- makeGRangesFromDataFrame(bnd_dt, keep.extra.columns = TRUE)
seqnames(bnd_gr) <- seqnames(BSgenome.jaNemVect1.1.DToL.Assembly)
bnd_gr_r <- GenomicRanges::restrict(bnd_gr)
bnd_dt <- as.data.table(bnd_gr_r)
fwrite(
  bnd_dt,
  file.path(res_dir, "motif-scores-archetypes-mona-annot-footprints.tsv.gz")
)

bkg_dist <- bin_det$background_dist
bkg_dt <- rbindlist(lapply(names(bkg_dist), function(smp) {
  x <- bkg_dist[[smp]]
  y <- bnd_dt[sample == smp]$ftp_threshold[1]
  z <- str_remove(smp, "_footprint")
  data.table(
    x = rnorm(1e3, mean = x$estimate[1], sd = x$estimate[2])
  )[, threshold := y][, sample := z]
}))
bkg_dt[, reporterline := str_extract(sample, "Elav|Fox|Ncol")]
fwrite(
  bkg_dt,
  file.path(res_dir, "motif-footprints-background.tsv.gz")
)

Motif hits are labeled as bound or unbouned based on threshold determined from background distribution of footprint scores (p value of normal fit to background = 0.001).

# background footprint scores
bkg_dt <- fread(file.path(res_dir, "motif-footprints-background.tsv.gz"))
gp_bckg <- ggplot(bkg_dt, aes(x, color = reporterline)) +
  geom_density() +
  scale_color_manual(values = line_cols) +
  scale_y_continuous(expand = expansion(mult = c(0,0.01))) +
  geom_vline(
    data = unique(bkg_dt[,.(threshold, sample, reporterline)]),
    aes(xintercept = threshold, color = reporterline), 
    size = 2, alpha = 0.2
  ) +
  labs(x = "background footprint scores") +
  theme(
    legend.position = "bottom"
  )

# foreground footprint scores
bnd_dt <- fread(file.path(res_dir, "motif-scores-archetypes-mona-annot-footprints.tsv.gz"))
bnd_dt[, reporterline := str_extract(sample, "Elav|Fox|Ncol")]
bnd_lab <- c("TRUE" = "bound", "FALSE" = "unbound")
gp_frwd <- ggplot(bnd_dt, aes(ftp_score, color = reporterline)) +
  geom_density() +
  scale_color_manual(values = line_cols) +
  scale_y_continuous(expand = expansion(mult = c(0,0.01))) +
  scale_x_continuous(limits = c(NA, 200), oob = function(x, limits) x) +
  geom_vline(
    data = unique(bnd_dt[,.(ftp_threshold, sample, reporterline)]),
    aes(xintercept = ftp_threshold, color = reporterline), 
    size = 2, alpha = 0.2
  ) +
  labs(x = "foreground footprint scores") +
  theme(
    legend.position = "bottom"
  ) +
  facet_grid(bound ~ ., scales = "free_y", labeller = as_labeller(bnd_lab))

gp_bnd <- (gp_bckg / gp_frwd & theme(legend.position = "bottom")) + 
  plot_layout(guides = "collect", heights = c(1, 2))
gp_bnd

Save bed files for bound and unbound sites

bnd_dt <- fread(file.path(res_dir, "motif-scores-archetypes-mona-annot-footprints.tsv.gz"))
bnd_dt[, reporterline := str_extract(sample, "Elav|Fox|Ncol")]

bin_dir <- file.path(ftp_dir, "BINDetect")
dir.create(bin_dir, showWarnings = FALSE)

bed_cols <- c("seqnames", "start", "end", "motif", "ftp_score", "strand")
for (smp in unique(bnd_dt$sample)) {

  message(smp)
  bnd_smp <- bnd_dt[sample == smp]
  dir.create(file.path(bin_dir, smp), showWarnings = FALSE)
  
  # all motif sites
  fwrite(
    unique(bnd_smp[, ..bed_cols]),
    file.path(bin_dir, smp, sprintf("%s_all.bed", smp)),
    col.names = FALSE, sep ="\t"
  )
  # all bound motif sites
  fwrite(
    unique(bnd_smp[bound == TRUE][, ..bed_cols]),
    file.path(bin_dir, smp, sprintf("%s_all_bound.bed", smp)),
    col.names = FALSE, sep ="\t"
  )
  # all unbound motif sites
  fwrite(
    unique(bnd_smp[bound == FALSE][, ..bed_cols]),
    file.path(bin_dir, smp, sprintf("%s_all_unbound.bed", smp)),
    col.names = FALSE, sep ="\t"
  )

  for (mot in unique(bnd_smp$motif)) {
    # message(mot)
    bmt_dt <- bnd_smp[motif == mot]
    mot <- str_replace_all(mot, c("/" = "_"))
    dir.create(file.path(bin_dir, smp, mot), showWarnings = FALSE)

    # all motif sites
    fwrite(
      unique(bmt_dt[, ..bed_cols]),
      file.path(bin_dir, smp, mot, sprintf("%s_%s_all.bed", smp, mot)),
      col.names = FALSE, sep ="\t"
    )
    # all bound motif sites
    fwrite(
      unique(bmt_dt[bound == TRUE][, ..bed_cols]),
      file.path(bin_dir, smp, mot, sprintf("%s_%s_all_bound.bed", smp, mot)),
      col.names = FALSE, sep ="\t"
    )
    # all unbound motif sites
    fwrite(
      unique(bmt_dt[bound == FALSE][, ..bed_cols]),
      file.path(bin_dir, smp, mot, sprintf("%s_%s_all_unbound.bed", smp, mot)),
      col.names = FALSE, sep ="\t"
    )
  }

}

Plot difference in footprint for bound and unbound sites

conda activate TOBIAS_ENV
bam_dir="ATACSEQ/nucleosome_free_regions/bam/"
bwg_dir="ATACSEQ/nucleosome_free_regions/footprint/ATACorrect"
bed_dir="ATACSEQ/nucleosome_free_regions/footprint/BINDetect"
plt_dir="ATACSEQ/nucleosome_free_regions/footprint/plots"
mkdir ${plt_dir}
cond="pos"
for line in Elav Fox Ncol
do
  TOBIAS PlotAggregate \
    --TFBS ${bed_dir}/${line}_${cond}/${line}_${cond}_all.bed \
            ${bed_dir}/${line}_${cond}/${line}_${cond}_all_bound.bed \
            ${bed_dir}/${line}_${cond}/${line}_${cond}_all_unbound.bed \
    --signals \
      ${bwg_dir}/${line}_pos_corrected.bw \
      ${bwg_dir}/${line}_neg_corrected.bw \
    --output ${plt_dir}/${line}_footprint_comparison_bound_peaks.pdf \
    --flank 125 \
    --share_y sites \
    --plot_boundaries
done

Plot the same for selected motifs

conda activate TOBIAS_ENV
bam_dir="ATACSEQ/nucleosome_free_regions/bam/"
bwg_dir="ATACSEQ/nucleosome_free_regions/footprint/ATACorrect"
bed_dir="ATACSEQ/nucleosome_free_regions/footprint/BINDetect"
plt_dir="ATACSEQ/nucleosome_free_regions/footprint/plots"
mkdir ${plt_dir}


mot="ARCH551_POU4"
cond="pos"
line1="Elav"
line2="Fox"
line3="Ncol"
TOBIAS PlotAggregate \
  --TFBS \
    ${bed_dir}/${line}_${cond}/${mot}/${line}_${cond}_${mot}_all_bound.bed \
    ${bed_dir}/${line}_${cond}/${mot}/${line}_${cond}_${mot}_all_unbound.bed \
  --signals \
    ${bwg_dir}/${line1}_pos_corrected.bw \
    ${bwg_dir}/${line2}_pos_corrected.bw \
    ${bwg_dir}/${line3}_pos_corrected.bw \
  --output ${plt_dir}/${line}_${mot}_footprint_comparison_bound_peaks.pdf \
  --flank 125 \
  --share_y sites \
  --plot_boundaries